path: root/test/CodeGen/AMDGPU
author    Dimitry Andric <dim@FreeBSD.org>    2015-06-21 13:59:01 +0000
committer Dimitry Andric <dim@FreeBSD.org>    2015-06-21 13:59:01 +0000
commit    3a0822f094b578157263e04114075ad7df81db41 (patch)
tree      bc48361fe2cd1ca5f93ac01b38b183774468fc79 /test/CodeGen/AMDGPU
parent    85d8b2bbe386bcfe669575d05b61482d7be07e5d (diff)
Diffstat (limited to 'test/CodeGen/AMDGPU')
-rw-r--r--test/CodeGen/AMDGPU/32-bit-local-address-space.ll139
-rw-r--r--test/CodeGen/AMDGPU/README21
-rw-r--r--test/CodeGen/AMDGPU/add-debug.ll24
-rw-r--r--test/CodeGen/AMDGPU/add.ll192
-rw-r--r--test/CodeGen/AMDGPU/add_i64.ll84
-rw-r--r--test/CodeGen/AMDGPU/address-space.ll36
-rw-r--r--test/CodeGen/AMDGPU/and.ll296
-rw-r--r--test/CodeGen/AMDGPU/anyext.ll15
-rw-r--r--test/CodeGen/AMDGPU/array-ptr-calc-i32.ll44
-rw-r--r--test/CodeGen/AMDGPU/array-ptr-calc-i64.ll17
-rw-r--r--test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll92
-rw-r--r--test/CodeGen/AMDGPU/atomic_load_add.ll39
-rw-r--r--test/CodeGen/AMDGPU/atomic_load_sub.ll39
-rw-r--r--test/CodeGen/AMDGPU/basic-branch.ll16
-rw-r--r--test/CodeGen/AMDGPU/basic-loop.ll18
-rw-r--r--test/CodeGen/AMDGPU/bfe_uint.ll26
-rw-r--r--test/CodeGen/AMDGPU/bfi_int.ll53
-rw-r--r--test/CodeGen/AMDGPU/big_alu.ll1173
-rw-r--r--test/CodeGen/AMDGPU/bitcast.ll79
-rw-r--r--test/CodeGen/AMDGPU/bswap.ll115
-rw-r--r--test/CodeGen/AMDGPU/build_vector.ll35
-rw-r--r--test/CodeGen/AMDGPU/call.ll33
-rw-r--r--test/CodeGen/AMDGPU/call_fs.ll17
-rw-r--r--test/CodeGen/AMDGPU/cayman-loop-bug.ll32
-rw-r--r--test/CodeGen/AMDGPU/cf-stack-bug.ll244
-rw-r--r--test/CodeGen/AMDGPU/cf_end.ll9
-rw-r--r--test/CodeGen/AMDGPU/cgp-addressing-modes.ll242
-rw-r--r--test/CodeGen/AMDGPU/coalescer_remat.ll57
-rw-r--r--test/CodeGen/AMDGPU/codegen-prepare-addrmode-sext.ll18
-rw-r--r--test/CodeGen/AMDGPU/combine_vloads.ll42
-rw-r--r--test/CodeGen/AMDGPU/commute-compares.ll697
-rw-r--r--test/CodeGen/AMDGPU/commute_modifiers.ll181
-rw-r--r--test/CodeGen/AMDGPU/complex-folding.ll19
-rw-r--r--test/CodeGen/AMDGPU/concat_vectors.ll296
-rw-r--r--test/CodeGen/AMDGPU/copy-illegal-type.ll167
-rw-r--r--test/CodeGen/AMDGPU/copy-to-reg.ll27
-rw-r--r--test/CodeGen/AMDGPU/ctlz_zero_undef.ll71
-rw-r--r--test/CodeGen/AMDGPU/ctpop.ll300
-rw-r--r--test/CodeGen/AMDGPU/ctpop64.ll124
-rw-r--r--test/CodeGen/AMDGPU/cttz_zero_undef.ll71
-rw-r--r--test/CodeGen/AMDGPU/cvt_f32_ubyte.ll196
-rw-r--r--test/CodeGen/AMDGPU/cvt_flr_i32_f32.ll86
-rw-r--r--test/CodeGen/AMDGPU/cvt_rpi_i32_f32.ll83
-rw-r--r--test/CodeGen/AMDGPU/dagcombiner-bug-illegal-vec4-int-to-fp.ll36
-rw-r--r--test/CodeGen/AMDGPU/debug.ll10
-rw-r--r--test/CodeGen/AMDGPU/default-fp-mode.ll36
-rw-r--r--test/CodeGen/AMDGPU/disconnected-predset-break-bug.ll29
-rw-r--r--test/CodeGen/AMDGPU/dot4-folding.ll27
-rw-r--r--test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll69
-rw-r--r--test/CodeGen/AMDGPU/ds_read2.ll515
-rw-r--r--test/CodeGen/AMDGPU/ds_read2_offset_order.ll45
-rw-r--r--test/CodeGen/AMDGPU/ds_read2st64.ll272
-rw-r--r--test/CodeGen/AMDGPU/ds_write2.ll425
-rw-r--r--test/CodeGen/AMDGPU/ds_write2st64.ll119
-rw-r--r--test/CodeGen/AMDGPU/elf.ll34
-rw-r--r--test/CodeGen/AMDGPU/elf.r600.ll17
-rw-r--r--test/CodeGen/AMDGPU/empty-function.ll21
-rw-r--r--test/CodeGen/AMDGPU/endcf-loop-header.ll34
-rw-r--r--test/CodeGen/AMDGPU/extload-private.ll46
-rw-r--r--test/CodeGen/AMDGPU/extload.ll53
-rw-r--r--test/CodeGen/AMDGPU/extract_vector_elt_i16.ll30
-rw-r--r--test/CodeGen/AMDGPU/fabs.f64.ll97
-rw-r--r--test/CodeGen/AMDGPU/fabs.ll101
-rw-r--r--test/CodeGen/AMDGPU/fadd.ll64
-rw-r--r--test/CodeGen/AMDGPU/fadd64.ll14
-rw-r--r--test/CodeGen/AMDGPU/fceil.ll132
-rw-r--r--test/CodeGen/AMDGPU/fceil64.ll105
-rw-r--r--test/CodeGen/AMDGPU/fcmp-cnd.ll14
-rw-r--r--test/CodeGen/AMDGPU/fcmp-cnde-int-args.ll16
-rw-r--r--test/CodeGen/AMDGPU/fcmp.ll38
-rw-r--r--test/CodeGen/AMDGPU/fcmp64.ll74
-rw-r--r--test/CodeGen/AMDGPU/fconst64.ll13
-rw-r--r--test/CodeGen/AMDGPU/fcopysign.f32.ll53
-rw-r--r--test/CodeGen/AMDGPU/fcopysign.f64.ll40
-rw-r--r--test/CodeGen/AMDGPU/fdiv.f64.ll96
-rw-r--r--test/CodeGen/AMDGPU/fdiv.ll68
-rw-r--r--test/CodeGen/AMDGPU/fetch-limits.r600.ll48
-rw-r--r--test/CodeGen/AMDGPU/fetch-limits.r700+.ll81
-rw-r--r--test/CodeGen/AMDGPU/ffloor.f64.ll127
-rw-r--r--test/CodeGen/AMDGPU/ffloor.ll49
-rw-r--r--test/CodeGen/AMDGPU/flat-address-space.ll184
-rw-r--r--test/CodeGen/AMDGPU/floor.ll15
-rw-r--r--test/CodeGen/AMDGPU/fma-combine.ll368
-rw-r--r--test/CodeGen/AMDGPU/fma.f64.ll47
-rw-r--r--test/CodeGen/AMDGPU/fma.ll92
-rw-r--r--test/CodeGen/AMDGPU/fmad.ll19
-rw-r--r--test/CodeGen/AMDGPU/fmax.ll17
-rw-r--r--test/CodeGen/AMDGPU/fmax3.f64.ll24
-rw-r--r--test/CodeGen/AMDGPU/fmax3.ll39
-rw-r--r--test/CodeGen/AMDGPU/fmax_legacy.f64.ll67
-rw-r--r--test/CodeGen/AMDGPU/fmax_legacy.ll116
-rw-r--r--test/CodeGen/AMDGPU/fmaxnum.f64.ll76
-rw-r--r--test/CodeGen/AMDGPU/fmaxnum.ll283
-rw-r--r--test/CodeGen/AMDGPU/fmin.ll17
-rw-r--r--test/CodeGen/AMDGPU/fmin3.ll40
-rw-r--r--test/CodeGen/AMDGPU/fmin_legacy.f64.ll77
-rw-r--r--test/CodeGen/AMDGPU/fmin_legacy.ll123
-rw-r--r--test/CodeGen/AMDGPU/fminnum.f64.ll76
-rw-r--r--test/CodeGen/AMDGPU/fminnum.ll281
-rw-r--r--test/CodeGen/AMDGPU/fmul.ll92
-rw-r--r--test/CodeGen/AMDGPU/fmul64.ll39
-rw-r--r--test/CodeGen/AMDGPU/fmuladd.ll199
-rw-r--r--test/CodeGen/AMDGPU/fnearbyint.ll58
-rw-r--r--test/CodeGen/AMDGPU/fneg-fabs.f64.ll100
-rw-r--r--test/CodeGen/AMDGPU/fneg-fabs.ll118
-rw-r--r--test/CodeGen/AMDGPU/fneg.f64.ll60
-rw-r--r--test/CodeGen/AMDGPU/fneg.ll70
-rw-r--r--test/CodeGen/AMDGPU/fp-classify.ll131
-rw-r--r--test/CodeGen/AMDGPU/fp16_to_fp.ll29
-rw-r--r--test/CodeGen/AMDGPU/fp32_to_fp16.ll15
-rw-r--r--test/CodeGen/AMDGPU/fp_to_sint.f64.ll56
-rw-r--r--test/CodeGen/AMDGPU/fp_to_sint.ll230
-rw-r--r--test/CodeGen/AMDGPU/fp_to_uint.f64.ll70
-rw-r--r--test/CodeGen/AMDGPU/fp_to_uint.ll217
-rw-r--r--test/CodeGen/AMDGPU/fpext.ll45
-rw-r--r--test/CodeGen/AMDGPU/fptrunc.ll45
-rw-r--r--test/CodeGen/AMDGPU/frem.ll112
-rw-r--r--test/CodeGen/AMDGPU/fsqrt.ll29
-rw-r--r--test/CodeGen/AMDGPU/fsub.ll75
-rw-r--r--test/CodeGen/AMDGPU/fsub64.ll107
-rw-r--r--test/CodeGen/AMDGPU/ftrunc.f64.ll111
-rw-r--r--test/CodeGen/AMDGPU/ftrunc.ll120
-rw-r--r--test/CodeGen/AMDGPU/gep-address-space.ll55
-rw-r--r--test/CodeGen/AMDGPU/global-directive.ll15
-rw-r--r--test/CodeGen/AMDGPU/global-extload-i1.ll302
-rw-r--r--test/CodeGen/AMDGPU/global-extload-i16.ll302
-rw-r--r--test/CodeGen/AMDGPU/global-extload-i32.ll457
-rw-r--r--test/CodeGen/AMDGPU/global-extload-i8.ll299
-rw-r--r--test/CodeGen/AMDGPU/global-zero-initializer.ll13
-rw-r--r--test/CodeGen/AMDGPU/global_atomics.ll801
-rw-r--r--test/CodeGen/AMDGPU/gv-const-addrspace-fail.ll57
-rw-r--r--test/CodeGen/AMDGPU/gv-const-addrspace.ll101
-rw-r--r--test/CodeGen/AMDGPU/half.ll525
-rw-r--r--test/CodeGen/AMDGPU/hsa.ll14
-rw-r--r--test/CodeGen/AMDGPU/i1-copy-implicit-def.ll22
-rw-r--r--test/CodeGen/AMDGPU/i1-copy-phi.ll30
-rw-r--r--test/CodeGen/AMDGPU/i8-to-double-to-float.ll11
-rw-r--r--test/CodeGen/AMDGPU/icmp-select-sete-reverse-args.ll18
-rw-r--r--test/CodeGen/AMDGPU/icmp64.ll93
-rw-r--r--test/CodeGen/AMDGPU/imm.ll617
-rw-r--r--test/CodeGen/AMDGPU/indirect-addressing-si.ll121
-rw-r--r--test/CodeGen/AMDGPU/indirect-private-64.ll91
-rw-r--r--test/CodeGen/AMDGPU/infinite-loop-evergreen.ll10
-rw-r--r--test/CodeGen/AMDGPU/infinite-loop.ll18
-rw-r--r--test/CodeGen/AMDGPU/inline-asm.ll12
-rw-r--r--test/CodeGen/AMDGPU/inline-calls.ll25
-rw-r--r--test/CodeGen/AMDGPU/input-mods.ll26
-rw-r--r--test/CodeGen/AMDGPU/insert_subreg.ll16
-rw-r--r--test/CodeGen/AMDGPU/insert_vector_elt.ll252
-rw-r--r--test/CodeGen/AMDGPU/jump-address.ll52
-rw-r--r--test/CodeGen/AMDGPU/kcache-fold.ll100
-rw-r--r--test/CodeGen/AMDGPU/kernel-args.ll473
-rw-r--r--test/CodeGen/AMDGPU/large-alloca.ll15
-rw-r--r--test/CodeGen/AMDGPU/large-constant-initializer.ll19
-rw-r--r--test/CodeGen/AMDGPU/lds-initializer.ll13
-rw-r--r--test/CodeGen/AMDGPU/lds-oqap-crash.ll28
-rw-r--r--test/CodeGen/AMDGPU/lds-output-queue.ll99
-rw-r--r--test/CodeGen/AMDGPU/lds-size.ll26
-rw-r--r--test/CodeGen/AMDGPU/lds-zero-initializer.ll13
-rw-r--r--test/CodeGen/AMDGPU/legalizedag-bug-expand-setcc.ll26
-rw-r--r--test/CodeGen/AMDGPU/lit.local.cfg2
-rw-r--r--test/CodeGen/AMDGPU/literals.ll64
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.abs.ll49
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.barrier.global.ll30
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.barrier.local.ll31
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.i32.ll437
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.u32.ll627
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.bfi.ll42
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.bfm.ll60
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.brev.ll28
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.clamp.ll67
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.class.ll497
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.cube.ll59
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.cvt_f32_ubyte.ll43
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.div_fixup.ll31
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.div_fmas.ll179
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.div_scale.ll364
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.flbit.i32.ll28
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.fract.f64.ll60
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.fract.ll65
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.imad24.ll22
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.imax.ll33
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.imin.ll33
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.imul24.ll16
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.kill.ll39
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.ldexp.ll23
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.legacy.rsq.ll13
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.mul.ll17
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.rcp.f64.ll33
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.rcp.ll50
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.clamped.f64.ll23
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.clamped.ll23
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.ll33
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.tex.ll42
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.trig_preop.ll30
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.trunc.ll17
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.umad24.ll38
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.umax.ll48
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.umin.ll48
-rw-r--r--test/CodeGen/AMDGPU/llvm.AMDGPU.umul24.ll18
-rw-r--r--test/CodeGen/AMDGPU/llvm.SI.fs.interp.ll59
-rw-r--r--test/CodeGen/AMDGPU/llvm.SI.gather4.ll509
-rw-r--r--test/CodeGen/AMDGPU/llvm.SI.getlod.ll45
-rw-r--r--test/CodeGen/AMDGPU/llvm.SI.image.ll50
-rw-r--r--test/CodeGen/AMDGPU/llvm.SI.image.sample.ll310
-rw-r--r--test/CodeGen/AMDGPU/llvm.SI.image.sample.o.ll310
-rw-r--r--test/CodeGen/AMDGPU/llvm.SI.imageload.ll132
-rw-r--r--test/CodeGen/AMDGPU/llvm.SI.load.dword.ll53
-rw-r--r--test/CodeGen/AMDGPU/llvm.SI.resinfo.ll111
-rw-r--r--test/CodeGen/AMDGPU/llvm.SI.sample-masked.ll96
-rw-r--r--test/CodeGen/AMDGPU/llvm.SI.sample.ll160
-rw-r--r--test/CodeGen/AMDGPU/llvm.SI.sampled.ll143
-rw-r--r--test/CodeGen/AMDGPU/llvm.SI.sendmsg-m0.ll20
-rw-r--r--test/CodeGen/AMDGPU/llvm.SI.sendmsg.ll24
-rw-r--r--test/CodeGen/AMDGPU/llvm.SI.tbuffer.store.ll47
-rw-r--r--test/CodeGen/AMDGPU/llvm.SI.tid.ll18
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgpu.dp4.ll11
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgpu.kilp.ll21
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgpu.lrp.ll13
-rw-r--r--test/CodeGen/AMDGPU/llvm.cos.ll41
-rw-r--r--test/CodeGen/AMDGPU/llvm.exp2.ll80
-rw-r--r--test/CodeGen/AMDGPU/llvm.log2.ll80
-rw-r--r--test/CodeGen/AMDGPU/llvm.memcpy.ll365
-rw-r--r--test/CodeGen/AMDGPU/llvm.pow.ll40
-rw-r--r--test/CodeGen/AMDGPU/llvm.rint.f64.ll46
-rw-r--r--test/CodeGen/AMDGPU/llvm.rint.ll62
-rw-r--r--test/CodeGen/AMDGPU/llvm.round.f64.ll74
-rw-r--r--test/CodeGen/AMDGPU/llvm.round.ll67
-rw-r--r--test/CodeGen/AMDGPU/llvm.sin.ll92
-rw-r--r--test/CodeGen/AMDGPU/llvm.sqrt.ll105
-rw-r--r--test/CodeGen/AMDGPU/load-i1.ll149
-rw-r--r--test/CodeGen/AMDGPU/load-input-fold.ll117
-rw-r--r--test/CodeGen/AMDGPU/load.ll709
-rw-r--r--test/CodeGen/AMDGPU/load.vec.ll25
-rw-r--r--test/CodeGen/AMDGPU/load64.ll31
-rw-r--r--test/CodeGen/AMDGPU/local-64.ll167
-rw-r--r--test/CodeGen/AMDGPU/local-atomics.ll551
-rw-r--r--test/CodeGen/AMDGPU/local-atomics64.ll470
-rw-r--r--test/CodeGen/AMDGPU/local-memory-two-objects.ll63
-rw-r--r--test/CodeGen/AMDGPU/local-memory.ll49
-rw-r--r--test/CodeGen/AMDGPU/loop-address.ll34
-rw-r--r--test/CodeGen/AMDGPU/loop-idiom.ll51
-rw-r--r--test/CodeGen/AMDGPU/lshl.ll15
-rw-r--r--test/CodeGen/AMDGPU/lshr.ll15
-rw-r--r--test/CodeGen/AMDGPU/m0-spill.ll35
-rw-r--r--test/CodeGen/AMDGPU/mad-combine.ll567
-rw-r--r--test/CodeGen/AMDGPU/mad-sub.ll215
-rw-r--r--test/CodeGen/AMDGPU/mad_int24.ll35
-rw-r--r--test/CodeGen/AMDGPU/mad_uint24.ll76
-rw-r--r--test/CodeGen/AMDGPU/madak.ll193
-rw-r--r--test/CodeGen/AMDGPU/madmk.ll205
-rw-r--r--test/CodeGen/AMDGPU/max-literals.ll67
-rw-r--r--test/CodeGen/AMDGPU/max.ll168
-rw-r--r--test/CodeGen/AMDGPU/max3.ll41
-rw-r--r--test/CodeGen/AMDGPU/merge-stores.ll633
-rw-r--r--test/CodeGen/AMDGPU/min.ll189
-rw-r--r--test/CodeGen/AMDGPU/min3.ll111
-rw-r--r--test/CodeGen/AMDGPU/missing-store.ll26
-rw-r--r--test/CodeGen/AMDGPU/mubuf.ll183
-rw-r--r--test/CodeGen/AMDGPU/mul.ll200
-rw-r--r--test/CodeGen/AMDGPU/mul_int24.ll23
-rw-r--r--test/CodeGen/AMDGPU/mul_uint24.ll67
-rw-r--r--test/CodeGen/AMDGPU/mulhu.ll17
-rw-r--r--test/CodeGen/AMDGPU/no-initializer-constant-addrspace.ll21
-rw-r--r--test/CodeGen/AMDGPU/no-shrink-extloads.ll191
-rw-r--r--test/CodeGen/AMDGPU/operand-folding.ll113
-rw-r--r--test/CodeGen/AMDGPU/operand-spacing.ll18
-rw-r--r--test/CodeGen/AMDGPU/or.ll178
-rw-r--r--test/CodeGen/AMDGPU/packetizer.ll34
-rw-r--r--test/CodeGen/AMDGPU/parallelandifcollapse.ll59
-rw-r--r--test/CodeGen/AMDGPU/parallelorifcollapse.ll66
-rw-r--r--test/CodeGen/AMDGPU/predicate-dp4.ll27
-rw-r--r--test/CodeGen/AMDGPU/predicates.ll104
-rw-r--r--test/CodeGen/AMDGPU/private-memory-atomics.ll32
-rw-r--r--test/CodeGen/AMDGPU/private-memory-broken.ll21
-rw-r--r--test/CodeGen/AMDGPU/private-memory.ll313
-rw-r--r--test/CodeGen/AMDGPU/pv-packing.ll45
-rw-r--r--test/CodeGen/AMDGPU/pv.ll241
-rw-r--r--test/CodeGen/AMDGPU/r600-encoding.ll25
-rw-r--r--test/CodeGen/AMDGPU/r600-export-fix.ll142
-rw-r--r--test/CodeGen/AMDGPU/r600-infinite-loop-bug-while-reorganizing-vector.ll58
-rw-r--r--test/CodeGen/AMDGPU/r600cfg.ll119
-rw-r--r--test/CodeGen/AMDGPU/reciprocal.ll15
-rw-r--r--test/CodeGen/AMDGPU/register-count-comments.ll27
-rw-r--r--test/CodeGen/AMDGPU/reorder-stores.ll105
-rw-r--r--test/CodeGen/AMDGPU/rotl.i64.ll39
-rw-r--r--test/CodeGen/AMDGPU/rotl.ll57
-rw-r--r--test/CodeGen/AMDGPU/rotr.i64.ll61
-rw-r--r--test/CodeGen/AMDGPU/rotr.ll53
-rw-r--r--test/CodeGen/AMDGPU/rsq.ll74
-rw-r--r--test/CodeGen/AMDGPU/rv7x0_count3.ll41
-rw-r--r--test/CodeGen/AMDGPU/s_movk_i32.ll185
-rw-r--r--test/CodeGen/AMDGPU/saddo.ll63
-rw-r--r--test/CodeGen/AMDGPU/salu-to-valu.ll118
-rw-r--r--test/CodeGen/AMDGPU/scalar_to_vector.ll81
-rw-r--r--test/CodeGen/AMDGPU/schedule-fs-loop-nested-if.ll82
-rw-r--r--test/CodeGen/AMDGPU/schedule-fs-loop-nested.ll88
-rw-r--r--test/CodeGen/AMDGPU/schedule-fs-loop.ll55
-rw-r--r--test/CodeGen/AMDGPU/schedule-global-loads.ll41
-rw-r--r--test/CodeGen/AMDGPU/schedule-if-2.ll94
-rw-r--r--test/CodeGen/AMDGPU/schedule-if.ll46
-rw-r--r--test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll51
-rw-r--r--test/CodeGen/AMDGPU/schedule-vs-if-nested-loop-failure.ll163
-rw-r--r--test/CodeGen/AMDGPU/schedule-vs-if-nested-loop.ll132
-rw-r--r--test/CodeGen/AMDGPU/scratch-buffer.ll87
-rw-r--r--test/CodeGen/AMDGPU/sdiv.ll104
-rw-r--r--test/CodeGen/AMDGPU/sdivrem24.ll239
-rw-r--r--test/CodeGen/AMDGPU/sdivrem64.ll225
-rw-r--r--test/CodeGen/AMDGPU/select-i1.ll15
-rw-r--r--test/CodeGen/AMDGPU/select-vectors.ll156
-rw-r--r--test/CodeGen/AMDGPU/select.ll47
-rw-r--r--test/CodeGen/AMDGPU/select64.ll68
-rw-r--r--test/CodeGen/AMDGPU/selectcc-cnd.ll12
-rw-r--r--test/CodeGen/AMDGPU/selectcc-cnde-int.ll12
-rw-r--r--test/CodeGen/AMDGPU/selectcc-icmp-select-float.ll16
-rw-r--r--test/CodeGen/AMDGPU/selectcc-opt.ll80
-rw-r--r--test/CodeGen/AMDGPU/selectcc.ll20
-rw-r--r--test/CodeGen/AMDGPU/set-dx10.ll161
-rw-r--r--test/CodeGen/AMDGPU/setcc-equivalent.ll30
-rw-r--r--test/CodeGen/AMDGPU/setcc-opt.ll236
-rw-r--r--test/CodeGen/AMDGPU/setcc.ll377
-rw-r--r--test/CodeGen/AMDGPU/setcc64.ll259
-rw-r--r--test/CodeGen/AMDGPU/seto.ll15
-rw-r--r--test/CodeGen/AMDGPU/setuo.ll15
-rw-r--r--test/CodeGen/AMDGPU/sext-eliminate.ll26
-rw-r--r--test/CodeGen/AMDGPU/sext-in-reg.ll611
-rw-r--r--test/CodeGen/AMDGPU/sgpr-control-flow.ll105
-rw-r--r--test/CodeGen/AMDGPU/sgpr-copy-duplicate-operand.ll19
-rw-r--r--test/CodeGen/AMDGPU/sgpr-copy.ll379
-rw-r--r--test/CodeGen/AMDGPU/shared-op-cycle.ll32
-rw-r--r--test/CodeGen/AMDGPU/shl.ll180
-rw-r--r--test/CodeGen/AMDGPU/shl_add_constant.ll90
-rw-r--r--test/CodeGen/AMDGPU/shl_add_ptr.ll284
-rw-r--r--test/CodeGen/AMDGPU/si-annotate-cf-assertion.ll25
-rw-r--r--test/CodeGen/AMDGPU/si-annotate-cf.ll63
-rw-r--r--test/CodeGen/AMDGPU/si-lod-bias.ll52
-rw-r--r--test/CodeGen/AMDGPU/si-sgpr-spill.ll1568
-rw-r--r--test/CodeGen/AMDGPU/si-spill-cf.ll501
-rw-r--r--test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll236
-rw-r--r--test/CodeGen/AMDGPU/si-vector-hang.ll105
-rw-r--r--test/CodeGen/AMDGPU/sign_extend.ll63
-rw-r--r--test/CodeGen/AMDGPU/simplify-demanded-bits-build-pair.ll39
-rw-r--r--test/CodeGen/AMDGPU/sint_to_fp.f64.ll61
-rw-r--r--test/CodeGen/AMDGPU/sint_to_fp.ll64
-rw-r--r--test/CodeGen/AMDGPU/smrd.ll111
-rw-r--r--test/CodeGen/AMDGPU/split-scalar-i64-add.ll48
-rw-r--r--test/CodeGen/AMDGPU/sra.ll213
-rw-r--r--test/CodeGen/AMDGPU/srem.ll112
-rw-r--r--test/CodeGen/AMDGPU/srl.ll186
-rw-r--r--test/CodeGen/AMDGPU/ssubo.ll65
-rw-r--r--test/CodeGen/AMDGPU/store-barrier.ll42
-rw-r--r--test/CodeGen/AMDGPU/store-v3i32.ll13
-rw-r--r--test/CodeGen/AMDGPU/store-v3i64.ll29
-rw-r--r--test/CodeGen/AMDGPU/store-vector-ptrs.ll12
-rw-r--r--test/CodeGen/AMDGPU/store.ll369
-rw-r--r--test/CodeGen/AMDGPU/store.r600.ll22
-rw-r--r--test/CodeGen/AMDGPU/structurize.ll83
-rw-r--r--test/CodeGen/AMDGPU/structurize1.ll62
-rw-r--r--test/CodeGen/AMDGPU/sub.ll130
-rw-r--r--test/CodeGen/AMDGPU/subreg-coalescer-crash.ll109
-rw-r--r--test/CodeGen/AMDGPU/subreg-eliminate-dead.ll19
-rw-r--r--test/CodeGen/AMDGPU/swizzle-export.ll129
-rw-r--r--test/CodeGen/AMDGPU/tex-clause-antidep.ll25
-rw-r--r--test/CodeGen/AMDGPU/texture-input-merge.ll31
-rw-r--r--test/CodeGen/AMDGPU/trunc-cmp-constant.ll170
-rw-r--r--test/CodeGen/AMDGPU/trunc-store-f64-to-f16.ll56
-rw-r--r--test/CodeGen/AMDGPU/trunc-store-i1.ll33
-rw-r--r--test/CodeGen/AMDGPU/trunc-vector-store-assertion-failure.ll20
-rw-r--r--test/CodeGen/AMDGPU/trunc.ll100
-rw-r--r--test/CodeGen/AMDGPU/tti-unroll-prefs.ll58
-rw-r--r--test/CodeGen/AMDGPU/uaddo.ll85
-rw-r--r--test/CodeGen/AMDGPU/udiv.ll48
-rw-r--r--test/CodeGen/AMDGPU/udivrem.ll345
-rw-r--r--test/CodeGen/AMDGPU/udivrem24.ll245
-rw-r--r--test/CodeGen/AMDGPU/udivrem64.ll223
-rw-r--r--test/CodeGen/AMDGPU/uint_to_fp.f64.ll98
-rw-r--r--test/CodeGen/AMDGPU/uint_to_fp.ll82
-rw-r--r--test/CodeGen/AMDGPU/unaligned-load-store.ll254
-rw-r--r--test/CodeGen/AMDGPU/unhandled-loop-condition-assertion.ll115
-rw-r--r--test/CodeGen/AMDGPU/unroll.ll36
-rw-r--r--test/CodeGen/AMDGPU/unsupported-cc.ll125
-rw-r--r--test/CodeGen/AMDGPU/urecip.ll13
-rw-r--r--test/CodeGen/AMDGPU/urem.ll94
-rw-r--r--test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll103
-rw-r--r--test/CodeGen/AMDGPU/usubo.ll86
-rw-r--r--test/CodeGen/AMDGPU/v1i64-kernel-arg.ll17
-rw-r--r--test/CodeGen/AMDGPU/v_cndmask.ll39
-rw-r--r--test/CodeGen/AMDGPU/valu-i1.ll188
-rw-r--r--test/CodeGen/AMDGPU/vector-alloca.ll77
-rw-r--r--test/CodeGen/AMDGPU/vertex-fetch-encoding.ll25
-rw-r--r--test/CodeGen/AMDGPU/vop-shrink.ll51
-rw-r--r--test/CodeGen/AMDGPU/vselect.ll77
-rw-r--r--test/CodeGen/AMDGPU/vselect64.ll15
-rw-r--r--test/CodeGen/AMDGPU/vtx-fetch-branch.ll29
-rw-r--r--test/CodeGen/AMDGPU/vtx-schedule.ll18
-rw-r--r--test/CodeGen/AMDGPU/wait.ll45
-rw-r--r--test/CodeGen/AMDGPU/work-item-intrinsics.ll238
-rw-r--r--test/CodeGen/AMDGPU/wrong-transalu-pos-fix.ll81
-rw-r--r--test/CodeGen/AMDGPU/xor.ll173
-rw-r--r--test/CodeGen/AMDGPU/zero_extend.ll41
400 files changed, 45380 insertions, 0 deletions
diff --git a/test/CodeGen/AMDGPU/32-bit-local-address-space.ll b/test/CodeGen/AMDGPU/32-bit-local-address-space.ll
new file mode 100644
index 000000000000..c7bcfd2ddab2
--- /dev/null
+++ b/test/CodeGen/AMDGPU/32-bit-local-address-space.ll
@@ -0,0 +1,139 @@
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+; On Southern Islands GPUs the local address space(3) uses 32-bit pointers and
+; the global address space(1) uses 64-bit pointers. These tests check to make sure
+; the correct pointer size is used for the local address space.
+
+; The e{{32|64}} suffix on the instructions refers to the encoding size and not
+; the size of the operands. The operand size is denoted in the instruction name.
+; Instructions with B32, U32, and I32 in their name take 32-bit operands, while
+; instructions with B64, U64, and I64 take 64-bit operands.
+
+; FUNC-LABEL: {{^}}local_address_load:
+; SI: v_mov_b32_e{{32|64}} [[PTR:v[0-9]]]
+; SI: ds_read_b32 v{{[0-9]+}}, [[PTR]]
+define void @local_address_load(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
+entry:
+ %0 = load i32, i32 addrspace(3)* %in
+ store i32 %0, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_address_gep:
+; SI: s_add_i32 [[SPTR:s[0-9]]]
+; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
+; SI: ds_read_b32 [[VPTR]]
+define void @local_address_gep(i32 addrspace(1)* %out, i32 addrspace(3)* %in, i32 %offset) {
+entry:
+ %0 = getelementptr i32, i32 addrspace(3)* %in, i32 %offset
+ %1 = load i32, i32 addrspace(3)* %0
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_address_gep_const_offset:
+; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], s{{[0-9]+}}
+; SI: ds_read_b32 v{{[0-9]+}}, [[VPTR]] offset:4
+define void @local_address_gep_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
+entry:
+ %0 = getelementptr i32, i32 addrspace(3)* %in, i32 1
+ %1 = load i32, i32 addrspace(3)* %0
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; Offset too large, can't fold into 16-bit immediate offset.
+; FUNC-LABEL: {{^}}local_address_gep_large_const_offset:
+; SI: s_add_i32 [[SPTR:s[0-9]]], s{{[0-9]+}}, 0x10004
+; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
+; SI: ds_read_b32 [[VPTR]]
+define void @local_address_gep_large_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
+entry:
+ %0 = getelementptr i32, i32 addrspace(3)* %in, i32 16385
+ %1 = load i32, i32 addrspace(3)* %0
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}null_32bit_lds_ptr:
+; SI: v_cmp_ne_i32
+; SI-NOT: v_cmp_ne_i32
+; SI: v_cndmask_b32
+define void @null_32bit_lds_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %lds) nounwind {
+ %cmp = icmp ne i32 addrspace(3)* %lds, null
+ %x = select i1 %cmp, i32 123, i32 456
+ store i32 %x, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}mul_32bit_ptr:
+; SI: s_mul_i32
+; SI-NEXT: s_add_i32
+; SI: ds_read_b32
+define void @mul_32bit_ptr(float addrspace(1)* %out, [3 x float] addrspace(3)* %lds, i32 %tid) {
+ %ptr = getelementptr [3 x float], [3 x float] addrspace(3)* %lds, i32 %tid, i32 0
+ %val = load float, float addrspace(3)* %ptr
+ store float %val, float addrspace(1)* %out
+ ret void
+}
+
+@g_lds = addrspace(3) global float undef, align 4
+
+; FUNC-LABEL: {{^}}infer_ptr_alignment_global_offset:
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0
+; SI: ds_read_b32 v{{[0-9]+}}, [[REG]]
+define void @infer_ptr_alignment_global_offset(float addrspace(1)* %out, i32 %tid) {
+ %val = load float, float addrspace(3)* @g_lds
+ store float %val, float addrspace(1)* %out
+ ret void
+}
+
+
+@ptr = addrspace(3) global i32 addrspace(3)* undef
+@dst = addrspace(3) global [16384 x i32] undef
+
+; FUNC-LABEL: {{^}}global_ptr:
+; SI: ds_write_b32
+define void @global_ptr() nounwind {
+ store i32 addrspace(3)* getelementptr ([16384 x i32], [16384 x i32] addrspace(3)* @dst, i32 0, i32 16), i32 addrspace(3)* addrspace(3)* @ptr
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_address_store:
+; SI: ds_write_b32
+define void @local_address_store(i32 addrspace(3)* %out, i32 %val) {
+ store i32 %val, i32 addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_address_gep_store:
+; SI: s_add_i32 [[SADDR:s[0-9]+]],
+; SI: v_mov_b32_e32 [[ADDR:v[0-9]+]], [[SADDR]]
+; SI: ds_write_b32 [[ADDR]], v{{[0-9]+}}
+define void @local_address_gep_store(i32 addrspace(3)* %out, i32, i32 %val, i32 %offset) {
+ %gep = getelementptr i32, i32 addrspace(3)* %out, i32 %offset
+ store i32 %val, i32 addrspace(3)* %gep, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_address_gep_const_offset_store:
+; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], s{{[0-9]+}}
+; SI: v_mov_b32_e32 [[VAL:v[0-9]+]], s{{[0-9]+}}
+; SI: ds_write_b32 [[VPTR]], [[VAL]] offset:4
+define void @local_address_gep_const_offset_store(i32 addrspace(3)* %out, i32 %val) {
+ %gep = getelementptr i32, i32 addrspace(3)* %out, i32 1
+ store i32 %val, i32 addrspace(3)* %gep, align 4
+ ret void
+}
+
+; Offset too large, can't fold into 16-bit immediate offset.
+; FUNC-LABEL: {{^}}local_address_gep_large_const_offset_store:
+; SI: s_add_i32 [[SPTR:s[0-9]]], s{{[0-9]+}}, 0x10004
+; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
+; SI: ds_write_b32 [[VPTR]], v{{[0-9]+$}}
+define void @local_address_gep_large_const_offset_store(i32 addrspace(3)* %out, i32 %val) {
+ %gep = getelementptr i32, i32 addrspace(3)* %out, i32 16385
+ store i32 %val, i32 addrspace(3)* %gep, align 4
+ ret void
+}
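
The "offset too large" cases above come down to byte arithmetic against the 16-bit DS offset field: a GEP by one i32 element is a byte offset of 4 and folds into "offset:4", while a GEP by 16385 elements is 16385 * 4 = 65540 = 0x10004 bytes, which exceeds the 16-bit field and is left as a separate s_add_i32 of the 0x10004 literal. A minimal sketch of the two situations (hypothetical function, not part of this change):

; byte offset 4 folds into the ds_read_b32 offset field;
; byte offset 0x10004 does not fit and needs an explicit add
define void @ds_offset_example(i32 addrspace(3)* %in, i32 addrspace(1)* %out) {
  %small = getelementptr i32, i32 addrspace(3)* %in, i32 1      ; +4 bytes
  %large = getelementptr i32, i32 addrspace(3)* %in, i32 16385  ; +65540 bytes
  %a = load i32, i32 addrspace(3)* %small
  %b = load i32, i32 addrspace(3)* %large
  %sum = add i32 %a, %b
  store i32 %sum, i32 addrspace(1)* %out
  ret void
}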
diff --git a/test/CodeGen/AMDGPU/README b/test/CodeGen/AMDGPU/README
new file mode 100644
index 000000000000..96998bba28f2
--- /dev/null
+++ b/test/CodeGen/AMDGPU/README
@@ -0,0 +1,21 @@
++==============================================================================+
+| How to organize the lit tests |
++==============================================================================+
+
+- If you write a test for matching a single DAG opcode or intrinsic, it should
+ go in a file called {opcode_name,intrinsic_name}.ll (e.g. fadd.ll)
+
+- If you write a test that matches several DAG opcodes and checks for a single
+ ISA instruction, then that test should go in a file called {ISA_name}.ll (e.g.
+ bfi_int.ll)
+
+- For all other tests, use your best judgement for organizing tests and naming
+ the files.
+
++==============================================================================+
+| Naming conventions |
++==============================================================================+
+
+- Use dash '-' and not underscore '_' to separate words in file names, unless
+ the file is named after a DAG opcode or ISA instruction that has an
+ underscore '_' in its name.
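
As a rough illustration of the layout these rules describe (hypothetical contents, not this directory's actual fadd.ll), a single-opcode test would typically look like:

; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s

; FUNC-LABEL: {{^}}fadd_f32:
; SI: v_add_f32
; EG: ADD
define void @fadd_f32(float addrspace(1)* %out, float %a, float %b) {
  %sum = fadd float %a, %b
  store float %sum, float addrspace(1)* %out
  ret void
}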
diff --git a/test/CodeGen/AMDGPU/add-debug.ll b/test/CodeGen/AMDGPU/add-debug.ll
new file mode 100644
index 000000000000..529905dd36a2
--- /dev/null
+++ b/test/CodeGen/AMDGPU/add-debug.ll
@@ -0,0 +1,24 @@
+; RUN: llc < %s -march=amdgcn -mcpu=tahiti -debug
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -debug
+; REQUIRES: asserts
+
+; Check that SelectionDAGDumper does not crash on int_SI_if.
+define void @add64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) {
+entry:
+ %0 = icmp eq i64 %a, 0
+ br i1 %0, label %if, label %else
+
+if:
+ %1 = load i64, i64 addrspace(1)* %in
+ br label %endif
+
+else:
+ %2 = add i64 %a, %b
+ br label %endif
+
+endif:
+ %3 = phi i64 [%1, %if], [%2, %else]
+ store i64 %3, i64 addrspace(1)* %out
+ ret void
+}
+
diff --git a/test/CodeGen/AMDGPU/add.ll b/test/CodeGen/AMDGPU/add.ll
new file mode 100644
index 000000000000..655e75dbc1a4
--- /dev/null
+++ b/test/CodeGen/AMDGPU/add.ll
@@ -0,0 +1,192 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG --check-prefix=FUNC %s
+; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
+
+;FUNC-LABEL: {{^}}test1:
+;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+;SI: v_add_i32_e32 [[REG:v[0-9]+]], {{v[0-9]+, v[0-9]+}}
+;SI-NOT: [[REG]]
+;SI: buffer_store_dword [[REG]],
+define void @test1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+ %a = load i32, i32 addrspace(1)* %in
+ %b = load i32, i32 addrspace(1)* %b_ptr
+ %result = add i32 %a, %b
+ store i32 %result, i32 addrspace(1)* %out
+ ret void
+}
+
+;FUNC-LABEL: {{^}}test2:
+;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+;SI: v_add_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI: v_add_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+
+define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+ %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
+ %a = load <2 x i32>, <2 x i32> addrspace(1)* %in
+ %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr
+ %result = add <2 x i32> %a, %b
+ store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+;FUNC-LABEL: {{^}}test4:
+;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+;SI: v_add_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI: v_add_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI: v_add_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI: v_add_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+
+define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+ %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
+ %a = load <4 x i32>, <4 x i32> addrspace(1)* %in
+ %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr
+ %result = add <4 x i32> %a, %b
+ store <4 x i32> %result, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test8:
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+
+; SI: s_add_i32
+; SI: s_add_i32
+; SI: s_add_i32
+; SI: s_add_i32
+; SI: s_add_i32
+; SI: s_add_i32
+; SI: s_add_i32
+; SI: s_add_i32
+define void @test8(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) {
+entry:
+ %0 = add <8 x i32> %a, %b
+ store <8 x i32> %0, <8 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test16:
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+
+; SI: s_add_i32
+; SI: s_add_i32
+; SI: s_add_i32
+; SI: s_add_i32
+; SI: s_add_i32
+; SI: s_add_i32
+; SI: s_add_i32
+; SI: s_add_i32
+; SI: s_add_i32
+; SI: s_add_i32
+; SI: s_add_i32
+; SI: s_add_i32
+; SI: s_add_i32
+; SI: s_add_i32
+; SI: s_add_i32
+; SI: s_add_i32
+define void @test16(<16 x i32> addrspace(1)* %out, <16 x i32> %a, <16 x i32> %b) {
+entry:
+ %0 = add <16 x i32> %a, %b
+ store <16 x i32> %0, <16 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}add64:
+; SI: s_add_u32
+; SI: s_addc_u32
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.[XYZW]]]
+; EG: MEM_RAT_CACHELESS STORE_RAW [[HI:T[0-9]+\.[XYZW]]]
+; EG-DAG: ADD_INT {{[* ]*}}[[LO]]
+; EG-DAG: ADDC_UINT
+; EG-DAG: ADD_INT
+; EG-DAG: ADD_INT {{[* ]*}}[[HI]]
+; EG-NOT: SUB
+define void @add64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+entry:
+ %0 = add i64 %a, %b
+ store i64 %0, i64 addrspace(1)* %out
+ ret void
+}
+
+; The v_addc_u32 and v_add_i32 instructions can't read SGPRs, because they
+; use VCC. The test is designed so that %a will be stored in an SGPR and
+; %0 will be stored in a VGPR, so the compiler will be forced to copy %a
+; to a VGPR before doing the add.
+
+; FUNC-LABEL: {{^}}add64_sgpr_vgpr:
+; SI-NOT: v_addc_u32_e32 s
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.[XYZW]]]
+; EG: MEM_RAT_CACHELESS STORE_RAW [[HI:T[0-9]+\.[XYZW]]]
+; EG-DAG: ADD_INT {{[* ]*}}[[LO]]
+; EG-DAG: ADDC_UINT
+; EG-DAG: ADD_INT
+; EG-DAG: ADD_INT {{[* ]*}}[[HI]]
+; EG-NOT: SUB
+define void @add64_sgpr_vgpr(i64 addrspace(1)* %out, i64 %a, i64 addrspace(1)* %in) {
+entry:
+ %0 = load i64, i64 addrspace(1)* %in
+ %1 = add i64 %a, %0
+ store i64 %1, i64 addrspace(1)* %out
+ ret void
+}
+
+; Test i64 add inside a branch.
+; FUNC-LABEL: {{^}}add64_in_branch:
+; SI: s_add_u32
+; SI: s_addc_u32
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.[XYZW]]]
+; EG: MEM_RAT_CACHELESS STORE_RAW [[HI:T[0-9]+\.[XYZW]]]
+; EG-DAG: ADD_INT {{[* ]*}}[[LO]]
+; EG-DAG: ADDC_UINT
+; EG-DAG: ADD_INT
+; EG-DAG: ADD_INT {{[* ]*}}[[HI]]
+; EG-NOT: SUB
+define void @add64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) {
+entry:
+ %0 = icmp eq i64 %a, 0
+ br i1 %0, label %if, label %else
+
+if:
+ %1 = load i64, i64 addrspace(1)* %in
+ br label %endif
+
+else:
+ %2 = add i64 %a, %b
+ br label %endif
+
+endif:
+ %3 = phi i64 [%1, %if], [%2, %else]
+ store i64 %3, i64 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/add_i64.ll b/test/CodeGen/AMDGPU/add_i64.ll
new file mode 100644
index 000000000000..8346add7df97
--- /dev/null
+++ b/test/CodeGen/AMDGPU/add_i64.ll
@@ -0,0 +1,84 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+
+declare i32 @llvm.r600.read.tidig.x() readnone
+
+; SI-LABEL: {{^}}test_i64_vreg:
+; SI: v_add_i32
+; SI: v_addc_u32
+define void @test_i64_vreg(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %inA, i64 addrspace(1)* noalias %inB) {
+ %tid = call i32 @llvm.r600.read.tidig.x() readnone
+ %a_ptr = getelementptr i64, i64 addrspace(1)* %inA, i32 %tid
+ %b_ptr = getelementptr i64, i64 addrspace(1)* %inB, i32 %tid
+ %a = load i64, i64 addrspace(1)* %a_ptr
+ %b = load i64, i64 addrspace(1)* %b_ptr
+ %result = add i64 %a, %b
+ store i64 %result, i64 addrspace(1)* %out
+ ret void
+}
+
+; Check that the SGPR add operand is correctly moved to a VGPR.
+; SI-LABEL: {{^}}sgpr_operand:
+; SI: v_add_i32
+; SI: v_addc_u32
+define void @sgpr_operand(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 addrspace(1)* noalias %in_bar, i64 %a) {
+ %foo = load i64, i64 addrspace(1)* %in, align 8
+ %result = add i64 %foo, %a
+ store i64 %result, i64 addrspace(1)* %out
+ ret void
+}
+
+; Swap the arguments. Check that the SGPR -> VGPR copy works with the
+; SGPR as the other operand.
+;
+; SI-LABEL: {{^}}sgpr_operand_reversed:
+; SI: v_add_i32
+; SI: v_addc_u32
+define void @sgpr_operand_reversed(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 %a) {
+ %foo = load i64, i64 addrspace(1)* %in, align 8
+ %result = add i64 %a, %foo
+ store i64 %result, i64 addrspace(1)* %out
+ ret void
+}
+
+
+; SI-LABEL: {{^}}test_v2i64_sreg:
+; SI: s_add_u32
+; SI: s_addc_u32
+; SI: s_add_u32
+; SI: s_addc_u32
+define void @test_v2i64_sreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> %a, <2 x i64> %b) {
+ %result = add <2 x i64> %a, %b
+ store <2 x i64> %result, <2 x i64> addrspace(1)* %out
+ ret void
+}
+
+; SI-LABEL: {{^}}test_v2i64_vreg:
+; SI: v_add_i32
+; SI: v_addc_u32
+; SI: v_add_i32
+; SI: v_addc_u32
+define void @test_v2i64_vreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %inA, <2 x i64> addrspace(1)* noalias %inB) {
+ %tid = call i32 @llvm.r600.read.tidig.x() readnone
+ %a_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inA, i32 %tid
+ %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inB, i32 %tid
+ %a = load <2 x i64>, <2 x i64> addrspace(1)* %a_ptr
+ %b = load <2 x i64>, <2 x i64> addrspace(1)* %b_ptr
+ %result = add <2 x i64> %a, %b
+ store <2 x i64> %result, <2 x i64> addrspace(1)* %out
+ ret void
+}
+
+; SI-LABEL: {{^}}trunc_i64_add_to_i32:
+; SI: s_load_dword s[[SREG0:[0-9]+]]
+; SI: s_load_dword s[[SREG1:[0-9]+]]
+; SI: s_add_i32 [[SRESULT:s[0-9]+]], s[[SREG1]], s[[SREG0]]
+; SI-NOT: addc
+; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
+; SI: buffer_store_dword [[VRESULT]],
+define void @trunc_i64_add_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) {
+ %add = add i64 %b, %a
+ %trunc = trunc i64 %add to i32
+ store i32 %trunc, i32 addrspace(1)* %out, align 8
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/address-space.ll b/test/CodeGen/AMDGPU/address-space.ll
new file mode 100644
index 000000000000..4be8c5847529
--- /dev/null
+++ b/test/CodeGen/AMDGPU/address-space.ll
@@ -0,0 +1,36 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
+
+; Test that codegenprepare understands address space sizes
+
+%struct.foo = type { [3 x float], [3 x float] }
+
+; FIXME: Extra V_MOV from SGPR to VGPR for second read. The address is
+; already in a VGPR after the first read.
+
+; CHECK-LABEL: {{^}}do_as_ptr_calcs:
+; CHECK: s_load_dword [[SREG1:s[0-9]+]],
+; CHECK: v_mov_b32_e32 [[VREG2:v[0-9]+]], [[SREG1]]
+; CHECK: v_mov_b32_e32 [[VREG1:v[0-9]+]], [[SREG1]]
+; CHECK-DAG: ds_read_b32 v{{[0-9]+}}, [[VREG1]] offset:12
+; CHECK-DAG: ds_read_b32 v{{[0-9]+}}, [[VREG2]] offset:20
+define void @do_as_ptr_calcs(%struct.foo addrspace(3)* nocapture %ptr) nounwind {
+entry:
+ %x = getelementptr inbounds %struct.foo, %struct.foo addrspace(3)* %ptr, i32 0, i32 1, i32 0
+ %y = getelementptr inbounds %struct.foo, %struct.foo addrspace(3)* %ptr, i32 0, i32 1, i32 2
+ br label %bb32
+
+bb32:
+ %a = load float, float addrspace(3)* %x, align 4
+ %b = load float, float addrspace(3)* %y, align 4
+ %cmp = fcmp one float %a, %b
+ br i1 %cmp, label %bb34, label %bb33
+
+bb33:
+ unreachable
+
+bb34:
+ unreachable
+}
+
+
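
The offset:12 and offset:20 values checked above follow from the struct layout: in %struct.foo = { [3 x float], [3 x float] } the second array starts at byte 12, so field 1 element 0 sits at byte offset 12 and field 1 element 2 at 12 + 2*4 = 20. A self-contained sketch of the same arithmetic (hypothetical function, reusing the %struct.foo type above; not part of this change):

define void @struct_foo_offsets(%struct.foo addrspace(3)* %ptr, float addrspace(1)* %out) {
  %x = getelementptr inbounds %struct.foo, %struct.foo addrspace(3)* %ptr, i32 0, i32 1, i32 0 ; byte offset 12
  %y = getelementptr inbounds %struct.foo, %struct.foo addrspace(3)* %ptr, i32 0, i32 1, i32 2 ; byte offset 12 + 8 = 20
  %a = load float, float addrspace(3)* %x, align 4
  %b = load float, float addrspace(3)* %y, align 4
  %sum = fadd float %a, %b
  store float %sum, float addrspace(1)* %out
  ret void
}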
diff --git a/test/CodeGen/AMDGPU/and.ll b/test/CodeGen/AMDGPU/and.ll
new file mode 100644
index 000000000000..5672d470bd7e
--- /dev/null
+++ b/test/CodeGen/AMDGPU/and.ll
@@ -0,0 +1,296 @@
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}test2:
+; EG: AND_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: AND_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+
+define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+ %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
+ %a = load <2 x i32>, <2 x i32> addrspace(1) * %in
+ %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr
+ %result = and <2 x i32> %a, %b
+ store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test4:
+; EG: AND_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: AND_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: AND_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: AND_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+
+define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+ %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
+ %a = load <4 x i32>, <4 x i32> addrspace(1) * %in
+ %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr
+ %result = and <4 x i32> %a, %b
+ store <4 x i32> %result, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_and_i32:
+; SI: s_and_b32
+define void @s_and_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+ %and = and i32 %a, %b
+ store i32 %and, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_and_constant_i32:
+; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x12d687
+define void @s_and_constant_i32(i32 addrspace(1)* %out, i32 %a) {
+ %and = and i32 %a, 1234567
+ store i32 %and, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_and_i32:
+; SI: v_and_b32
+define void @v_and_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) {
+ %a = load i32, i32 addrspace(1)* %aptr, align 4
+ %b = load i32, i32 addrspace(1)* %bptr, align 4
+ %and = and i32 %a, %b
+ store i32 %and, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_and_constant_i32
+; SI: v_and_b32_e32 v{{[0-9]+}}, 0x12d687, v{{[0-9]+}}
+define void @v_and_constant_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) {
+ %a = load i32, i32 addrspace(1)* %aptr, align 4
+ %and = and i32 %a, 1234567
+ store i32 %and, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_and_inline_imm_64_i32
+; SI: v_and_b32_e32 v{{[0-9]+}}, 64, v{{[0-9]+}}
+define void @v_and_inline_imm_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) {
+ %a = load i32, i32 addrspace(1)* %aptr, align 4
+ %and = and i32 %a, 64
+ store i32 %and, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_and_inline_imm_neg_16_i32
+; SI: v_and_b32_e32 v{{[0-9]+}}, -16, v{{[0-9]+}}
+define void @v_and_inline_imm_neg_16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) {
+ %a = load i32, i32 addrspace(1)* %aptr, align 4
+ %and = and i32 %a, -16
+ store i32 %and, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_and_i64
+; SI: s_and_b64
+define void @s_and_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+ %and = and i64 %a, %b
+ store i64 %and, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FIXME: Should use SGPRs
+; FUNC-LABEL: {{^}}s_and_i1:
+; SI: v_and_b32
+define void @s_and_i1(i1 addrspace(1)* %out, i1 %a, i1 %b) {
+ %and = and i1 %a, %b
+ store i1 %and, i1 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_and_constant_i64
+; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
+define void @s_and_constant_i64(i64 addrspace(1)* %out, i64 %a) {
+ %and = and i64 %a, 281474976710655
+ store i64 %and, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_and_i64:
+; SI: v_and_b32
+; SI: v_and_b32
+define void @v_and_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) {
+ %a = load i64, i64 addrspace(1)* %aptr, align 8
+ %b = load i64, i64 addrspace(1)* %bptr, align 8
+ %and = and i64 %a, %b
+ store i64 %and, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_and_i64_br:
+; SI: v_and_b32
+; SI: v_and_b32
+define void @v_and_i64_br(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr, i32 %cond) {
+entry:
+ %tmp0 = icmp eq i32 %cond, 0
+ br i1 %tmp0, label %if, label %endif
+
+if:
+ %a = load i64, i64 addrspace(1)* %aptr, align 8
+ %b = load i64, i64 addrspace(1)* %bptr, align 8
+ %and = and i64 %a, %b
+ br label %endif
+
+endif:
+ %tmp1 = phi i64 [%and, %if], [0, %entry]
+ store i64 %tmp1, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_and_constant_i64:
+; SI: v_and_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
+; SI: v_and_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
+define void @v_and_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
+ %a = load i64, i64 addrspace(1)* %aptr, align 8
+ %and = and i64 %a, 1234567
+ store i64 %and, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FIXME: Replace and 0 with mov 0
+; FUNC-LABEL: {{^}}v_and_inline_imm_i64:
+; SI: v_and_b32_e32 {{v[0-9]+}}, 64, {{v[0-9]+}}
+; SI: v_and_b32_e32 {{v[0-9]+}}, 0, {{v[0-9]+}}
+define void @v_and_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
+ %a = load i64, i64 addrspace(1)* %aptr, align 8
+ %and = and i64 %a, 64
+ store i64 %and, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_and_inline_imm_64_i64
+; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 64
+define void @s_and_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+ %and = and i64 %a, 64
+ store i64 %and, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_and_inline_imm_1_i64
+; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 1
+define void @s_and_inline_imm_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+ %and = and i64 %a, 1
+ store i64 %and, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_and_inline_imm_1.0_i64
+; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 1.0
+define void @s_and_inline_imm_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+ %and = and i64 %a, 4607182418800017408
+ store i64 %and, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_and_inline_imm_neg_1.0_i64
+; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -1.0
+define void @s_and_inline_imm_neg_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+ %and = and i64 %a, 13830554455654793216
+ store i64 %and, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_and_inline_imm_0.5_i64
+; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0.5
+define void @s_and_inline_imm_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+ %and = and i64 %a, 4602678819172646912
+ store i64 %and, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_and_inline_imm_neg_0.5_i64
+; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -0.5
+define void @s_and_inline_imm_neg_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+ %and = and i64 %a, 13826050856027422720
+ store i64 %and, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_and_inline_imm_2.0_i64
+; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 2.0
+define void @s_and_inline_imm_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+ %and = and i64 %a, 4611686018427387904
+ store i64 %and, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_and_inline_imm_neg_2.0_i64
+; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -2.0
+define void @s_and_inline_imm_neg_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+ %and = and i64 %a, 13835058055282163712
+ store i64 %and, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_and_inline_imm_4.0_i64
+; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 4.0
+define void @s_and_inline_imm_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+ %and = and i64 %a, 4616189618054758400
+ store i64 %and, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_and_inline_imm_neg_4.0_i64
+; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -4.0
+define void @s_and_inline_imm_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+ %and = and i64 %a, 13839561654909534208
+ store i64 %and, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+
+; Test with the 64-bit integer bitpattern for a 32-bit float in the
+; low 32-bits, which is not a valid 64-bit inline immediate.
+
+; FUNC-LABEL: {{^}}s_and_inline_imm_f32_4.0_i64
+; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 4.0
+; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0{{$}}
+; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}
+define void @s_and_inline_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+ %and = and i64 %a, 1082130432
+ store i64 %and, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FIXME: Copy of -1 register
+; FUNC-LABEL: {{^}}s_and_inline_imm_f32_neg_4.0_i64
+; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], -4.0
+; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], -1{{$}}
+; SI-DAG: s_mov_b32 s[[K_HI_COPY:[0-9]+]], s[[K_HI]]
+; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI_COPY]]{{\]}}
+define void @s_and_inline_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+ %and = and i64 %a, -1065353216
+ store i64 %and, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; Shift into upper 32-bits
+; FUNC-LABEL: {{^}}s_and_inline_high_imm_f32_4.0_i64
+; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 4.0
+; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0{{$}}
+; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}
+define void @s_and_inline_high_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+ %and = and i64 %a, 4647714815446351872
+ store i64 %and, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_and_inline_high_imm_f32_neg_4.0_i64
+; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], -4.0
+; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0{{$}}
+; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}
+define void @s_and_inline_high_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+ %and = and i64 %a, 13871086852301127680
+ store i64 %and, i64 addrspace(1)* %out, align 8
+ ret void
+}
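
The large decimal constants in the inline-immediate tests above are floating-point bit patterns: f64 2.0 is 0x4000000000000000 = 4611686018427387904, f64 4.0 is 0x4010000000000000 = 4616189618054758400, f32 4.0 is 0x40800000 = 1082130432, and f32 4.0 shifted into the high dword is 0x4080000000000000 = 4647714815446351872. Only the f64-shaped patterns qualify as 64-bit inline immediates; the f32-shaped ones must be materialized in SGPRs first, as the K_LO/K_HI checks expect. A minimal sketch (hypothetical function, not part of this change):

define void @f32_pattern_in_i64(i64 addrspace(1)* %out, i64 %a) {
  %lo = and i64 %a, 1082130432           ; f32 4.0 in the low 32 bits -> s_mov pair + s_and_b64
  %hi = and i64 %lo, 4647714815446351872 ; f32 4.0 in the high 32 bits -> s_mov pair + s_and_b64
  store i64 %hi, i64 addrspace(1)* %out, align 8
  ret void
}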
diff --git a/test/CodeGen/AMDGPU/anyext.ll b/test/CodeGen/AMDGPU/anyext.ll
new file mode 100644
index 000000000000..48d8f3122495
--- /dev/null
+++ b/test/CodeGen/AMDGPU/anyext.ll
@@ -0,0 +1,15 @@
+; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
+
+; CHECK-LABEL: {{^}}anyext_i1_i32:
+; CHECK: v_cndmask_b32_e64
+define void @anyext_i1_i32(i32 addrspace(1)* %out, i32 %cond) {
+entry:
+ %0 = icmp eq i32 %cond, 0
+ %1 = zext i1 %0 to i8
+ %2 = xor i8 %1, -1
+ %3 = and i8 %2, 1
+ %4 = zext i8 %3 to i32
+ store i32 %4, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll b/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll
new file mode 100644
index 000000000000..8c2a0795860d
--- /dev/null
+++ b/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll
@@ -0,0 +1,44 @@
+; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI -mattr=-promote-alloca < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI %s
+; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI -mattr=+promote-alloca < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI %s
+
+declare i32 @llvm.SI.tid() nounwind readnone
+declare void @llvm.AMDGPU.barrier.local() nounwind noduplicate
+
+; The required pointer calculations for the alloca'd value actually require
+; an add and won't be folded into the addressing, which fails with a
+; 64-bit pointer add. This should work since private pointers should
+; be 32-bits.
+
+; SI-LABEL: {{^}}test_private_array_ptr_calc:
+
+; FIXME: We end up with a zero argument for the ADD, because
+; SIRegisterInfo::eliminateFrameIndex() blindly replaces the frame index
+; with the appropriate offset. We should fold this into the store.
+; SI-ALLOCA: v_add_i32_e32 [[PTRREG:v[0-9]+]], 0, v{{[0-9]+}}
+; SI-ALLOCA: buffer_store_dword {{v[0-9]+}}, [[PTRREG]], s[{{[0-9]+:[0-9]+}}]
+;
+; FIXME: The AMDGPUPromoteAlloca pass should be able to convert this
+; alloca to a vector. It currently fails because it does not know how
+; to interpret:
+; getelementptr [4 x i32], [4 x i32]* %alloca, i32 1, i32 %b
+
+; SI-PROMOTE: v_add_i32_e32 [[PTRREG:v[0-9]+]], 16
+; SI-PROMOTE: ds_write_b32 [[PTRREG]]
+define void @test_private_array_ptr_calc(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) {
+ %alloca = alloca [4 x i32], i32 4, align 16
+ %tid = call i32 @llvm.SI.tid() readnone
+ %a_ptr = getelementptr i32, i32 addrspace(1)* %inA, i32 %tid
+ %b_ptr = getelementptr i32, i32 addrspace(1)* %inB, i32 %tid
+ %a = load i32, i32 addrspace(1)* %a_ptr
+ %b = load i32, i32 addrspace(1)* %b_ptr
+ %result = add i32 %a, %b
+ %alloca_ptr = getelementptr [4 x i32], [4 x i32]* %alloca, i32 1, i32 %b
+ store i32 %result, i32* %alloca_ptr, align 4
+ ; Dummy call
+ call void @llvm.AMDGPU.barrier.local() nounwind noduplicate
+ %reload = load i32, i32* %alloca_ptr, align 4
+ %out_ptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ store i32 %reload, i32 addrspace(1)* %out_ptr, align 4
+ ret void
+}
+
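
The 32-bit private pointer point can be seen in isolation: indexing into an alloca only needs 32-bit address arithmetic (scale the index by 4 and add it to the scratch offset), whereas folding the same add into a generic 64-bit pointer calculation would take a v_add_i32/v_addc_u32 pair. A small sketch (hypothetical function, not part of this change):

define void @private_index(i32 addrspace(1)* %out, i32 %idx) {
  %buf = alloca [4 x i32], align 16
  %p = getelementptr [4 x i32], [4 x i32]* %buf, i32 0, i32 %idx ; 32-bit offset math only
  store i32 0, i32* %p, align 4
  %v = load i32, i32* %p, align 4
  store i32 %v, i32 addrspace(1)* %out, align 4
  ret void
}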
diff --git a/test/CodeGen/AMDGPU/array-ptr-calc-i64.ll b/test/CodeGen/AMDGPU/array-ptr-calc-i64.ll
new file mode 100644
index 000000000000..eae095eb8449
--- /dev/null
+++ b/test/CodeGen/AMDGPU/array-ptr-calc-i64.ll
@@ -0,0 +1,17 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+declare i32 @llvm.SI.tid() readnone
+
+; SI-LABEL: {{^}}test_array_ptr_calc:
+; SI: v_mul_lo_i32
+; SI: v_mul_hi_i32
+define void @test_array_ptr_calc(i32 addrspace(1)* noalias %out, [1025 x i32] addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) {
+ %tid = call i32 @llvm.SI.tid() readnone
+ %a_ptr = getelementptr [1025 x i32], [1025 x i32] addrspace(1)* %inA, i32 %tid, i32 0
+ %b_ptr = getelementptr i32, i32 addrspace(1)* %inB, i32 %tid
+ %a = load i32, i32 addrspace(1)* %a_ptr
+ %b = load i32, i32 addrspace(1)* %b_ptr
+ %result = add i32 %a, %b
+ store i32 %result, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll b/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll
new file mode 100644
index 000000000000..ef2560ef1849
--- /dev/null
+++ b/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll
@@ -0,0 +1,92 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=SICI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=SICI -check-prefix=CIVI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_ret_i32_offset:
+; GCN: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7
+; SICI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SICI: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
+; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; VI: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
+; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
+; GCN-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]]
+; GCN: ds_cmpst_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[VCMP]], [[VSWAP]] offset:16
+; GCN: s_endpgm
+define void @lds_atomic_cmpxchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %swap) nounwind {
+ %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
+ %pair = cmpxchg i32 addrspace(3)* %gep, i32 7, i32 %swap seq_cst monotonic
+ %result = extractvalue { i32, i1 } %pair, 0
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_ret_i64_offset:
+; GCN-DAG: v_mov_b32_e32 v[[LOVCMP:[0-9]+]], 7
+; GCN-DAG: v_mov_b32_e32 v[[HIVCMP:[0-9]+]], 0
+; SICI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SICI: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
+; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; VI: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34
+; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
+; GCN-DAG: v_mov_b32_e32 v[[LOSWAPV:[0-9]+]], s[[LOSWAP]]
+; GCN-DAG: v_mov_b32_e32 v[[HISWAPV:[0-9]+]], s[[HISWAP]]
+; GCN: ds_cmpst_rtn_b64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVCMP]]:[[HIVCMP]]{{\]}}, v{{\[}}[[LOSWAPV]]:[[HISWAPV]]{{\]}} offset:32
+; GCN: buffer_store_dwordx2 [[RESULT]],
+; GCN: s_endpgm
+define void @lds_atomic_cmpxchg_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr, i64 %swap) nounwind {
+ %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
+ %pair = cmpxchg i64 addrspace(3)* %gep, i64 7, i64 %swap seq_cst monotonic
+ %result = extractvalue { i64, i1 } %pair, 0
+ store i64 %result, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_ret_i32_bad_si_offset
+; SI: ds_cmpst_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; CIVI: ds_cmpst_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
+define void @lds_atomic_cmpxchg_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %swap, i32 %a, i32 %b) nounwind {
+ %sub = sub i32 %a, %b
+ %add = add i32 %sub, 4
+ %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 %add
+ %pair = cmpxchg i32 addrspace(3)* %gep, i32 7, i32 %swap seq_cst monotonic
+ %result = extractvalue { i32, i1 } %pair, 0
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_noret_i32_offset:
+; SICI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9
+; SICI: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xa
+; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x24
+; VI: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x28
+; GCN-DAG: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7
+; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
+; GCN-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]]
+; GCN: ds_cmpst_b32 [[VPTR]], [[VCMP]], [[VSWAP]] offset:16
+; GCN: s_endpgm
+define void @lds_atomic_cmpxchg_noret_i32_offset(i32 addrspace(3)* %ptr, i32 %swap) nounwind {
+ %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
+ %pair = cmpxchg i32 addrspace(3)* %gep, i32 7, i32 %swap seq_cst monotonic
+ %result = extractvalue { i32, i1 } %pair, 0
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_noret_i64_offset:
+; SICI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9
+; SICI: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x24
+; VI: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; GCN-DAG: v_mov_b32_e32 v[[LOVCMP:[0-9]+]], 7
+; GCN-DAG: v_mov_b32_e32 v[[HIVCMP:[0-9]+]], 0
+; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
+; GCN-DAG: v_mov_b32_e32 v[[LOSWAPV:[0-9]+]], s[[LOSWAP]]
+; GCN-DAG: v_mov_b32_e32 v[[HISWAPV:[0-9]+]], s[[HISWAP]]
+; GCN: ds_cmpst_b64 [[VPTR]], v{{\[}}[[LOVCMP]]:[[HIVCMP]]{{\]}}, v{{\[}}[[LOSWAPV]]:[[HISWAPV]]{{\]}} offset:32
+; GCN: s_endpgm
+define void @lds_atomic_cmpxchg_noret_i64_offset(i64 addrspace(3)* %ptr, i64 %swap) nounwind {
+ %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
+ %pair = cmpxchg i64 addrspace(3)* %gep, i64 7, i64 %swap seq_cst monotonic
+ %result = extractvalue { i64, i1 } %pair, 0
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/atomic_load_add.ll b/test/CodeGen/AMDGPU/atomic_load_add.ll
new file mode 100644
index 000000000000..20c685447eef
--- /dev/null
+++ b/test/CodeGen/AMDGPU/atomic_load_add.ll
@@ -0,0 +1,39 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}atomic_add_local:
+; R600: LDS_ADD *
+; SI: ds_add_u32
+define void @atomic_add_local(i32 addrspace(3)* %local) {
+ %unused = atomicrmw volatile add i32 addrspace(3)* %local, i32 5 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_add_local_const_offset:
+; R600: LDS_ADD *
+; SI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
+define void @atomic_add_local_const_offset(i32 addrspace(3)* %local) {
+ %gep = getelementptr i32, i32 addrspace(3)* %local, i32 4
+ %val = atomicrmw volatile add i32 addrspace(3)* %gep, i32 5 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_add_ret_local:
+; R600: LDS_ADD_RET *
+; SI: ds_add_rtn_u32
+define void @atomic_add_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
+ %val = atomicrmw volatile add i32 addrspace(3)* %local, i32 5 seq_cst
+ store i32 %val, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_add_ret_local_const_offset:
+; R600: LDS_ADD_RET *
+; SI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:20
+define void @atomic_add_ret_local_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
+ %gep = getelementptr i32, i32 addrspace(3)* %local, i32 5
+ %val = atomicrmw volatile add i32 addrspace(3)* %gep, i32 5 seq_cst
+ store i32 %val, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/atomic_load_sub.ll b/test/CodeGen/AMDGPU/atomic_load_sub.ll
new file mode 100644
index 000000000000..4c6f45525b9e
--- /dev/null
+++ b/test/CodeGen/AMDGPU/atomic_load_sub.ll
@@ -0,0 +1,39 @@
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}atomic_sub_local:
+; R600: LDS_SUB *
+; SI: ds_sub_u32
+define void @atomic_sub_local(i32 addrspace(3)* %local) {
+ %unused = atomicrmw volatile sub i32 addrspace(3)* %local, i32 5 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_sub_local_const_offset:
+; R600: LDS_SUB *
+; SI: ds_sub_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
+define void @atomic_sub_local_const_offset(i32 addrspace(3)* %local) {
+ %gep = getelementptr i32, i32 addrspace(3)* %local, i32 4
+ %val = atomicrmw volatile sub i32 addrspace(3)* %gep, i32 5 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_sub_ret_local:
+; R600: LDS_SUB_RET *
+; SI: ds_sub_rtn_u32
+define void @atomic_sub_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
+ %val = atomicrmw volatile sub i32 addrspace(3)* %local, i32 5 seq_cst
+ store i32 %val, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_sub_ret_local_const_offset:
+; R600: LDS_SUB_RET *
+; SI: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:20
+define void @atomic_sub_ret_local_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
+ %gep = getelementptr i32, i32 addrspace(3)* %local, i32 5
+ %val = atomicrmw volatile sub i32 addrspace(3)* %gep, i32 5 seq_cst
+ store i32 %val, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/basic-branch.ll b/test/CodeGen/AMDGPU/basic-branch.ll
new file mode 100644
index 000000000000..abdc4afef472
--- /dev/null
+++ b/test/CodeGen/AMDGPU/basic-branch.ll
@@ -0,0 +1,16 @@
+; XFAIL: *
+; RUN: llc -O0 -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -O0 -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
+
+; CHECK-LABEL: {{^}}test_branch(
+define void @test_branch(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %val) nounwind {
+ %cmp = icmp ne i32 %val, 0
+ br i1 %cmp, label %store, label %end
+
+store:
+ store i32 222, i32 addrspace(1)* %out
+ ret void
+
+end:
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/basic-loop.ll b/test/CodeGen/AMDGPU/basic-loop.ll
new file mode 100644
index 000000000000..f0263caf5d6b
--- /dev/null
+++ b/test/CodeGen/AMDGPU/basic-loop.ll
@@ -0,0 +1,18 @@
+; RUN: llc -O0 -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck %s
+; RUN: llc -O0 -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck %s
+
+; CHECK-LABEL: {{^}}test_loop:
+define void @test_loop(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %val) nounwind {
+entry:
+ br label %loop.body
+
+loop.body:
+ %i = phi i32 [0, %entry], [%i.inc, %loop.body]
+ store i32 222, i32 addrspace(1)* %out
+ %cmp = icmp ne i32 %i, %val
+ %i.inc = add i32 %i, 1
+ br i1 %cmp, label %loop.body, label %end
+
+end:
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/bfe_uint.ll b/test/CodeGen/AMDGPU/bfe_uint.ll
new file mode 100644
index 000000000000..32e3fc26106f
--- /dev/null
+++ b/test/CodeGen/AMDGPU/bfe_uint.ll
@@ -0,0 +1,26 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+; CHECK: {{^}}bfe_def:
+; CHECK: BFE_UINT
+define void @bfe_def(i32 addrspace(1)* %out, i32 %x) {
+entry:
+ %0 = lshr i32 %x, 5
+ %1 = and i32 %0, 15 ; 0xf
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; This program could be implemented using a BFE_UINT instruction; however,
+; since the lshr constant plus the number of bits in the mask is >= 32, it can
+; also be implemented with an LSHR instruction, which is better because LSHR
+; has fewer operands and requires fewer constants.
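+; For example, with a 32-bit %x, (lshr %x, 16) already leaves only 16
+; significant bits, so the 'and' with 0xffff is a no-op and a single LSHR
+; covers the whole pattern.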
+
+; CHECK: {{^}}bfe_shift:
+; CHECK-NOT: BFE_UINT
+define void @bfe_shift(i32 addrspace(1)* %out, i32 %x) {
+entry:
+ %0 = lshr i32 %x, 16
+ %1 = and i32 %0, 65535 ; 0xffff
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/bfi_int.ll b/test/CodeGen/AMDGPU/bfi_int.ll
new file mode 100644
index 000000000000..03349349735d
--- /dev/null
+++ b/test/CodeGen/AMDGPU/bfi_int.ll
@@ -0,0 +1,53 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600 %s
+; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI %s
+
+; BFI_INT Definition pattern from ISA docs
+; (y & x) | (z & ~x)
+;
+; R600: {{^}}bfi_def:
+; R600: BFI_INT
+; SI: @bfi_def
+; SI: v_bfi_b32
+define void @bfi_def(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) {
+entry:
+ %0 = xor i32 %x, -1
+ %1 = and i32 %z, %0
+ %2 = and i32 %y, %x
+ %3 = or i32 %1, %2
+ store i32 %3, i32 addrspace(1)* %out
+ ret void
+}
+
+; SHA-256 Ch function
+; z ^ (x & (y ^ z))
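+; Note: this is the bit-level select form of Ch: per bit it computes x ? y : z,
+; i.e. z ^ (x & (y ^ z)) == (x & y) | (~x & z), so it should still match BFI.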
+; R600: {{^}}bfi_sha256_ch:
+; R600: BFI_INT
+; SI: @bfi_sha256_ch
+; SI: v_bfi_b32
+define void @bfi_sha256_ch(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) {
+entry:
+ %0 = xor i32 %y, %z
+ %1 = and i32 %x, %0
+ %2 = xor i32 %z, %1
+ store i32 %2, i32 addrspace(1)* %out
+ ret void
+}
+
+; SHA-256 Ma function
+; ((x & z) | (y & (x | z)))
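+; Note: Ma is a bitwise majority, so it folds to one XOR plus one BFI: where
+; two of the inputs agree the result is that common value, and where they
+; differ it is the remaining input, which is a BFI select keyed on the XOR of
+; the two compared inputs, hence the expected XOR + BFI sequence below.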
+; R600: {{^}}bfi_sha256_ma:
+; R600: XOR_INT * [[DST:T[0-9]+\.[XYZW]]], KC0[2].Z, KC0[2].W
+; R600: BFI_INT * {{T[0-9]+\.[XYZW]}}, {{[[DST]]|PV\.[XYZW]}}, KC0[3].X, KC0[2].W
+; SI: v_xor_b32_e32 [[DST:v[0-9]+]], {{s[0-9]+, v[0-9]+}}
+; SI: v_bfi_b32 {{v[0-9]+}}, [[DST]], {{s[0-9]+, v[0-9]+}}
+
+define void @bfi_sha256_ma(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) {
+entry:
+ %0 = and i32 %x, %z
+ %1 = or i32 %x, %z
+ %2 = and i32 %y, %1
+ %3 = or i32 %0, %2
+ store i32 %3, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/big_alu.ll b/test/CodeGen/AMDGPU/big_alu.ll
new file mode 100644
index 000000000000..2671c5d102b3
--- /dev/null
+++ b/test/CodeGen/AMDGPU/big_alu.ll
@@ -0,0 +1,1173 @@
+;RUN: llc < %s -march=r600 -mcpu=cedar
+
+;This test ensures that the R600 backend can handle ifcvt properly
+;and does not generate ALU clauses with more than 128 instructions.
+
+define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3, <4 x float> inreg %reg4, <4 x float> inreg %reg5, <4 x float> inreg %reg6, <4 x float> inreg %reg7, <4 x float> inreg %reg8, <4 x float> inreg %reg9) #0 {
+main_body:
+ %0 = extractelement <4 x float> %reg0, i32 0
+ %1 = extractelement <4 x float> %reg0, i32 1
+ %2 = extractelement <4 x float> %reg0, i32 2
+ %3 = extractelement <4 x float> %reg0, i32 3
+ %4 = extractelement <4 x float> %reg1, i32 0
+ %5 = extractelement <4 x float> %reg9, i32 0
+ %6 = extractelement <4 x float> %reg8, i32 0
+ %7 = fcmp ugt float %6, 0.000000e+00
+ %8 = select i1 %7, float %4, float %5
+ %9 = extractelement <4 x float> %reg1, i32 1
+ %10 = extractelement <4 x float> %reg9, i32 1
+ %11 = extractelement <4 x float> %reg8, i32 0
+ %12 = fcmp ugt float %11, 0.000000e+00
+ %13 = select i1 %12, float %9, float %10
+ %14 = extractelement <4 x float> %reg1, i32 2
+ %15 = extractelement <4 x float> %reg9, i32 2
+ %16 = extractelement <4 x float> %reg8, i32 0
+ %17 = fcmp ugt float %16, 0.000000e+00
+ %18 = select i1 %17, float %14, float %15
+ %19 = extractelement <4 x float> %reg1, i32 3
+ %20 = extractelement <4 x float> %reg9, i32 3
+ %21 = extractelement <4 x float> %reg8, i32 0
+ %22 = extractelement <4 x float> %reg2, i32 0
+ %23 = extractelement <4 x float> %reg2, i32 1
+ %24 = extractelement <4 x float> %reg2, i32 2
+ %25 = extractelement <4 x float> %reg2, i32 3
+ %26 = extractelement <4 x float> %reg3, i32 0
+ %27 = extractelement <4 x float> %reg3, i32 1
+ %28 = extractelement <4 x float> %reg3, i32 2
+ %29 = extractelement <4 x float> %reg3, i32 3
+ %30 = extractelement <4 x float> %reg4, i32 0
+ %31 = extractelement <4 x float> %reg4, i32 1
+ %32 = extractelement <4 x float> %reg4, i32 2
+ %33 = extractelement <4 x float> %reg4, i32 3
+ %34 = extractelement <4 x float> %reg5, i32 0
+ %35 = extractelement <4 x float> %reg5, i32 1
+ %36 = extractelement <4 x float> %reg5, i32 2
+ %37 = extractelement <4 x float> %reg5, i32 3
+ %38 = extractelement <4 x float> %reg6, i32 0
+ %39 = extractelement <4 x float> %reg6, i32 1
+ %40 = extractelement <4 x float> %reg6, i32 2
+ %41 = extractelement <4 x float> %reg6, i32 3
+ %42 = extractelement <4 x float> %reg7, i32 0
+ %43 = extractelement <4 x float> %reg7, i32 1
+ %44 = extractelement <4 x float> %reg7, i32 2
+ %45 = extractelement <4 x float> %reg7, i32 3
+ %46 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 11)
+ %47 = extractelement <4 x float> %46, i32 0
+ %48 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 11)
+ %49 = extractelement <4 x float> %48, i32 1
+ %50 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 11)
+ %51 = extractelement <4 x float> %50, i32 2
+ %52 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 12)
+ %53 = extractelement <4 x float> %52, i32 0
+ %54 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14)
+ %55 = extractelement <4 x float> %54, i32 0
+ %56 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14)
+ %57 = extractelement <4 x float> %56, i32 1
+ %58 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14)
+ %59 = extractelement <4 x float> %58, i32 2
+ %60 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14)
+ %61 = extractelement <4 x float> %60, i32 3
+ %62 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 16)
+ %63 = extractelement <4 x float> %62, i32 0
+ %64 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 16)
+ %65 = extractelement <4 x float> %64, i32 1
+ %66 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 16)
+ %67 = extractelement <4 x float> %66, i32 2
+ %68 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
+ %69 = extractelement <4 x float> %68, i32 0
+ %70 = fcmp oge float %69, 3.500000e+00
+ %71 = sext i1 %70 to i32
+ %72 = bitcast i32 %71 to float
+ %73 = bitcast float %72 to i32
+ %74 = icmp ne i32 %73, 0
+ %. = select i1 %74, float 0.000000e+00, float 0.000000e+00
+ %75 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
+ %76 = extractelement <4 x float> %75, i32 0
+ %77 = fcmp oge float %76, 2.000000e+00
+ %78 = sext i1 %77 to i32
+ %79 = bitcast i32 %78 to float
+ %80 = bitcast float %79 to i32
+ %81 = icmp ne i32 %80, 0
+ br i1 %81, label %IF137, label %ENDIF136
+
+IF137: ; preds = %main_body
+ %82 = insertelement <4 x float> undef, float %30, i32 0
+ %83 = insertelement <4 x float> %82, float %31, i32 1
+ %84 = insertelement <4 x float> %83, float %32, i32 2
+ %85 = insertelement <4 x float> %84, float 0.000000e+00, i32 3
+ %86 = insertelement <4 x float> undef, float %30, i32 0
+ %87 = insertelement <4 x float> %86, float %31, i32 1
+ %88 = insertelement <4 x float> %87, float %32, i32 2
+ %89 = insertelement <4 x float> %88, float 0.000000e+00, i32 3
+ %90 = call float @llvm.AMDGPU.dp4(<4 x float> %85, <4 x float> %89)
+ %91 = call float @llvm.AMDGPU.rsq.f32(float %90)
+ %92 = fmul float %30, %91
+ %93 = fmul float %31, %91
+ %94 = fmul float %32, %91
+ %95 = insertelement <4 x float> undef, float %92, i32 0
+ %96 = insertelement <4 x float> %95, float %93, i32 1
+ %97 = insertelement <4 x float> %96, float %94, i32 2
+ %98 = insertelement <4 x float> %97, float 0.000000e+00, i32 3
+ %99 = insertelement <4 x float> undef, float %37, i32 0
+ %100 = insertelement <4 x float> %99, float %38, i32 1
+ %101 = insertelement <4 x float> %100, float %39, i32 2
+ %102 = insertelement <4 x float> %101, float 0.000000e+00, i32 3
+ %103 = call float @llvm.AMDGPU.dp4(<4 x float> %98, <4 x float> %102)
+ %104 = insertelement <4 x float> undef, float %92, i32 0
+ %105 = insertelement <4 x float> %104, float %93, i32 1
+ %106 = insertelement <4 x float> %105, float %94, i32 2
+ %107 = insertelement <4 x float> %106, float 0.000000e+00, i32 3
+ %108 = insertelement <4 x float> undef, float %40, i32 0
+ %109 = insertelement <4 x float> %108, float %41, i32 1
+ %110 = insertelement <4 x float> %109, float %42, i32 2
+ %111 = insertelement <4 x float> %110, float 0.000000e+00, i32 3
+ %112 = call float @llvm.AMDGPU.dp4(<4 x float> %107, <4 x float> %111)
+ %113 = fsub float -0.000000e+00, %92
+ %114 = fsub float -0.000000e+00, %93
+ %115 = fsub float -0.000000e+00, %94
+ %116 = insertelement <4 x float> undef, float %34, i32 0
+ %117 = insertelement <4 x float> %116, float %35, i32 1
+ %118 = insertelement <4 x float> %117, float %36, i32 2
+ %119 = insertelement <4 x float> %118, float 0.000000e+00, i32 3
+ %120 = insertelement <4 x float> undef, float %113, i32 0
+ %121 = insertelement <4 x float> %120, float %114, i32 1
+ %122 = insertelement <4 x float> %121, float %115, i32 2
+ %123 = insertelement <4 x float> %122, float 0.000000e+00, i32 3
+ %124 = call float @llvm.AMDGPU.dp4(<4 x float> %119, <4 x float> %123)
+ %125 = fdiv float 1.000000e+00, %124
+ %126 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
+ %127 = extractelement <4 x float> %126, i32 0
+ %128 = fmul float %127, %125
+ %129 = fmul float %103, %128
+ %130 = fmul float %112, %128
+ %131 = bitcast float %. to i32
+ %132 = sitofp i32 %131 to float
+ %133 = fdiv float 1.000000e+00, %132
+ %134 = bitcast float %. to i32
+ %135 = add i32 %134, -1
+ %136 = bitcast i32 %135 to float
+ %137 = bitcast float %136 to i32
+ br label %LOOP
+
+ENDIF136: ; preds = %main_body, %ENDIF154
+ %temp68.1 = phi float [ %600, %ENDIF154 ], [ 0.000000e+00, %main_body ]
+ %temp69.0 = phi float [ %602, %ENDIF154 ], [ 0.000000e+00, %main_body ]
+ %temp70.0 = phi float [ %604, %ENDIF154 ], [ 1.000000e+00, %main_body ]
+ %138 = fmul float %26, 0x3F847AE140000000
+ %139 = fmul float %27, 0x3F847AE140000000
+ %140 = fmul float %28, 0x3F847AE140000000
+ %141 = insertelement <4 x float> undef, float %138, i32 0
+ %142 = insertelement <4 x float> %141, float %139, i32 1
+ %143 = insertelement <4 x float> %142, float %140, i32 2
+ %144 = insertelement <4 x float> %143, float 0.000000e+00, i32 3
+ %145 = extractelement <4 x float> %144, i32 0
+ %146 = extractelement <4 x float> %144, i32 1
+ %147 = extractelement <4 x float> %144, i32 2
+ %148 = extractelement <4 x float> %144, i32 3
+ %149 = insertelement <4 x float> undef, float %145, i32 0
+ %150 = insertelement <4 x float> %149, float %146, i32 1
+ %151 = insertelement <4 x float> %150, float %147, i32 2
+ %152 = insertelement <4 x float> %151, float %148, i32 3
+ %153 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %152, i32 16, i32 0, i32 3)
+ %154 = extractelement <4 x float> %153, i32 0
+ %155 = extractelement <4 x float> %153, i32 1
+ %156 = extractelement <4 x float> %153, i32 2
+ %157 = extractelement <4 x float> %153, i32 3
+ %158 = fmul float %26, 0x3F45A07B40000000
+ %159 = fmul float %27, 0x3F45A07B40000000
+ %160 = fmul float %28, 0x3F45A07B40000000
+ %161 = insertelement <4 x float> undef, float %158, i32 0
+ %162 = insertelement <4 x float> %161, float %159, i32 1
+ %163 = insertelement <4 x float> %162, float %160, i32 2
+ %164 = insertelement <4 x float> %163, float 0.000000e+00, i32 3
+ %165 = extractelement <4 x float> %164, i32 0
+ %166 = extractelement <4 x float> %164, i32 1
+ %167 = extractelement <4 x float> %164, i32 2
+ %168 = extractelement <4 x float> %164, i32 3
+ %169 = insertelement <4 x float> undef, float %165, i32 0
+ %170 = insertelement <4 x float> %169, float %166, i32 1
+ %171 = insertelement <4 x float> %170, float %167, i32 2
+ %172 = insertelement <4 x float> %171, float %168, i32 3
+ %173 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %172, i32 16, i32 0, i32 3)
+ %174 = extractelement <4 x float> %173, i32 0
+ %175 = extractelement <4 x float> %173, i32 1
+ %176 = extractelement <4 x float> %173, i32 2
+ %177 = extractelement <4 x float> %173, i32 3
+ %178 = fmul float %176, 3.000000e+03
+ %179 = fadd float %178, %28
+ %180 = fdiv float 1.000000e+00, %33
+ %181 = fmul float %32, %180
+ %182 = call float @fabs(float %181)
+ %183 = fmul float %174, 0x3FD99999A0000000
+ %184 = fadd float %183, 0x3FAEB851E0000000
+ %185 = fmul float %175, 0x3FE3333340000000
+ %186 = fadd float %185, %184
+ %187 = fmul float %176, 2.000000e+00
+ %188 = fadd float %187, %186
+ %189 = fmul float %177, 4.000000e+00
+ %190 = fadd float %189, %188
+ %191 = fmul float %154, 0x3FB99999A0000000
+ %192 = fadd float %191, %190
+ %193 = fmul float %155, 0x3FD99999A0000000
+ %194 = fadd float %193, %192
+ %195 = fmul float %156, 0x3FE99999A0000000
+ %196 = fadd float %195, %194
+ %197 = fmul float %157, 0x4000CCCCC0000000
+ %198 = fadd float %197, %196
+ %199 = fmul float 0xBE5EFB4CC0000000, %182
+ %200 = fmul float %199, %182
+ %201 = call float @llvm.AMDIL.exp.(float %200)
+ %202 = call float @llvm.AMDGPU.lrp(float %201, float %198, float 0x3FA99999A0000000)
+ %203 = fadd float %202, 0x3FF4CCCCC0000000
+ %204 = fmul float %203, 0x3FE1C71C80000000
+ %205 = call float @llvm.AMDIL.clamp.(float %204, float 0.000000e+00, float 1.000000e+00)
+ %206 = fadd float %202, 0x3FF4CCCCC0000000
+ %207 = fmul float %206, 0x3FE1C71C80000000
+ %208 = call float @llvm.AMDIL.clamp.(float %207, float 0.000000e+00, float 1.000000e+00)
+ %209 = fadd float %202, 2.000000e+00
+ %210 = fmul float %209, 0x3FD611A7A0000000
+ %211 = call float @llvm.AMDIL.clamp.(float %210, float 0.000000e+00, float 1.000000e+00)
+ %212 = fmul float 2.000000e+00, %205
+ %213 = fsub float -0.000000e+00, %212
+ %214 = fadd float 3.000000e+00, %213
+ %215 = fmul float %205, %214
+ %216 = fmul float %205, %215
+ %217 = fmul float 2.000000e+00, %208
+ %218 = fsub float -0.000000e+00, %217
+ %219 = fadd float 3.000000e+00, %218
+ %220 = fmul float %208, %219
+ %221 = fmul float %208, %220
+ %222 = fmul float 2.000000e+00, %211
+ %223 = fsub float -0.000000e+00, %222
+ %224 = fadd float 3.000000e+00, %223
+ %225 = fmul float %211, %224
+ %226 = fmul float %211, %225
+ %227 = fmul float %26, 0x3F368B5CC0000000
+ %228 = fmul float %27, 0x3F368B5CC0000000
+ %229 = insertelement <4 x float> undef, float %227, i32 0
+ %230 = insertelement <4 x float> %229, float %228, i32 1
+ %231 = insertelement <4 x float> %230, float 0.000000e+00, i32 2
+ %232 = insertelement <4 x float> %231, float 0.000000e+00, i32 3
+ %233 = extractelement <4 x float> %232, i32 0
+ %234 = extractelement <4 x float> %232, i32 1
+ %235 = insertelement <4 x float> undef, float %233, i32 0
+ %236 = insertelement <4 x float> %235, float %234, i32 1
+ %237 = insertelement <4 x float> %236, float undef, i32 2
+ %238 = insertelement <4 x float> %237, float undef, i32 3
+ %239 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %238, i32 17, i32 1, i32 2)
+ %240 = extractelement <4 x float> %239, i32 0
+ %241 = insertelement <4 x float> undef, float %240, i32 0
+ %242 = insertelement <4 x float> %241, float %228, i32 1
+ %243 = insertelement <4 x float> %242, float 0.000000e+00, i32 2
+ %244 = insertelement <4 x float> %243, float 0.000000e+00, i32 3
+ %245 = extractelement <4 x float> %244, i32 0
+ %246 = insertelement <4 x float> undef, float %245, i32 0
+ %247 = insertelement <4 x float> %246, float undef, i32 1
+ %248 = insertelement <4 x float> %247, float undef, i32 2
+ %249 = insertelement <4 x float> %248, float undef, i32 3
+ %250 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %249, i32 18, i32 2, i32 1)
+ %251 = extractelement <4 x float> %250, i32 0
+ %252 = extractelement <4 x float> %250, i32 1
+ %253 = extractelement <4 x float> %250, i32 2
+ %254 = extractelement <4 x float> %250, i32 3
+ %255 = fmul float %251, %216
+ %256 = fmul float %252, %221
+ %257 = fmul float %253, %226
+ %258 = fmul float %254, 0.000000e+00
+ %259 = fadd float %202, 0x3FF4CCCCC0000000
+ %260 = fmul float %259, 0x3FE1C71C80000000
+ %261 = call float @llvm.AMDIL.clamp.(float %260, float 0.000000e+00, float 1.000000e+00)
+ %262 = fadd float %202, 0x3FF4CCCCC0000000
+ %263 = fmul float %262, 0x3FE1C71C80000000
+ %264 = call float @llvm.AMDIL.clamp.(float %263, float 0.000000e+00, float 1.000000e+00)
+ %265 = fadd float %202, 2.000000e+00
+ %266 = fmul float %265, 0x3FD611A7A0000000
+ %267 = call float @llvm.AMDIL.clamp.(float %266, float 0.000000e+00, float 1.000000e+00)
+ %268 = fmul float 2.000000e+00, %261
+ %269 = fsub float -0.000000e+00, %268
+ %270 = fadd float 3.000000e+00, %269
+ %271 = fmul float %261, %270
+ %272 = fmul float %261, %271
+ %273 = fmul float 2.000000e+00, %264
+ %274 = fsub float -0.000000e+00, %273
+ %275 = fadd float 3.000000e+00, %274
+ %276 = fmul float %264, %275
+ %277 = fmul float %264, %276
+ %278 = fmul float 2.000000e+00, %267
+ %279 = fsub float -0.000000e+00, %278
+ %280 = fadd float 3.000000e+00, %279
+ %281 = fmul float %267, %280
+ %282 = fmul float %267, %281
+ %283 = fmul float %26, 0x3F22DFD6A0000000
+ %284 = fmul float %27, 0x3F22DFD6A0000000
+ %285 = insertelement <4 x float> undef, float %283, i32 0
+ %286 = insertelement <4 x float> %285, float %284, i32 1
+ %287 = insertelement <4 x float> %286, float 0.000000e+00, i32 2
+ %288 = insertelement <4 x float> %287, float 0.000000e+00, i32 3
+ %289 = extractelement <4 x float> %288, i32 0
+ %290 = extractelement <4 x float> %288, i32 1
+ %291 = insertelement <4 x float> undef, float %289, i32 0
+ %292 = insertelement <4 x float> %291, float %290, i32 1
+ %293 = insertelement <4 x float> %292, float undef, i32 2
+ %294 = insertelement <4 x float> %293, float undef, i32 3
+ %295 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %294, i32 19, i32 3, i32 2)
+ %296 = extractelement <4 x float> %295, i32 0
+ %297 = extractelement <4 x float> %295, i32 1
+ %298 = extractelement <4 x float> %295, i32 2
+ %299 = extractelement <4 x float> %295, i32 3
+ %300 = fmul float %296, %272
+ %301 = fmul float %297, %277
+ %302 = fmul float %298, %282
+ %303 = fmul float %299, 0.000000e+00
+ %304 = fmul float %temp68.1, %37
+ %305 = fmul float %temp68.1, %38
+ %306 = fmul float %temp68.1, %39
+ %307 = fmul float %temp69.0, %40
+ %308 = fadd float %307, %304
+ %309 = fmul float %temp69.0, %41
+ %310 = fadd float %309, %305
+ %311 = fmul float %temp69.0, %42
+ %312 = fadd float %311, %306
+ %313 = fmul float %temp70.0, %34
+ %314 = fadd float %313, %308
+ %315 = fmul float %temp70.0, %35
+ %316 = fadd float %315, %310
+ %317 = fmul float %temp70.0, %36
+ %318 = fadd float %317, %312
+ %319 = insertelement <4 x float> undef, float %314, i32 0
+ %320 = insertelement <4 x float> %319, float %316, i32 1
+ %321 = insertelement <4 x float> %320, float %318, i32 2
+ %322 = insertelement <4 x float> %321, float 0.000000e+00, i32 3
+ %323 = insertelement <4 x float> undef, float %314, i32 0
+ %324 = insertelement <4 x float> %323, float %316, i32 1
+ %325 = insertelement <4 x float> %324, float %318, i32 2
+ %326 = insertelement <4 x float> %325, float 0.000000e+00, i32 3
+ %327 = call float @llvm.AMDGPU.dp4(<4 x float> %322, <4 x float> %326)
+ %328 = call float @llvm.AMDGPU.rsq.f32(float %327)
+ %329 = fmul float %314, %328
+ %330 = fmul float %316, %328
+ %331 = fmul float %318, %328
+ %332 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6)
+ %333 = extractelement <4 x float> %332, i32 0
+ %334 = fsub float -0.000000e+00, %333
+ %335 = fadd float 1.000000e+00, %334
+ %336 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
+ %337 = extractelement <4 x float> %336, i32 0
+ %338 = fsub float -0.000000e+00, %337
+ %339 = fadd float 1.000000e+00, %338
+ %340 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8)
+ %341 = extractelement <4 x float> %340, i32 0
+ %342 = fsub float -0.000000e+00, %341
+ %343 = fadd float 1.000000e+00, %342
+ %344 = fsub float -0.000000e+00, %335
+ %345 = fadd float %202, %344
+ %346 = fsub float -0.000000e+00, %339
+ %347 = fadd float %202, %346
+ %348 = fadd float %347, 0xBFE3333340000000
+ %349 = fsub float -0.000000e+00, %202
+ %350 = fsub float -0.000000e+00, %343
+ %351 = fadd float %349, %350
+ %352 = insertelement <4 x float> undef, float %43, i32 0
+ %353 = insertelement <4 x float> %352, float %44, i32 1
+ %354 = insertelement <4 x float> %353, float %45, i32 2
+ %355 = insertelement <4 x float> %354, float 0.000000e+00, i32 3
+ %356 = insertelement <4 x float> undef, float %43, i32 0
+ %357 = insertelement <4 x float> %356, float %44, i32 1
+ %358 = insertelement <4 x float> %357, float %45, i32 2
+ %359 = insertelement <4 x float> %358, float 0.000000e+00, i32 3
+ %360 = call float @llvm.AMDGPU.dp4(<4 x float> %355, <4 x float> %359)
+ %361 = call float @llvm.AMDGPU.rsq.f32(float %360)
+ %362 = fmul float %45, %361
+ %363 = call float @fabs(float %362)
+ %364 = fmul float %176, 0x3FECCCCCC0000000
+ %365 = fadd float %364, %363
+ %366 = fadd float %365, 0xBFEFAE1480000000
+ %367 = fmul float %366, 0xC023FFFFC0000000
+ %368 = call float @llvm.AMDIL.clamp.(float %367, float 0.000000e+00, float 1.000000e+00)
+ %369 = fsub float -0.000000e+00, %335
+ %370 = fadd float %202, %369
+ %371 = fadd float %370, 0x3FBEB851E0000000
+ %372 = fsub float -0.000000e+00, %339
+ %373 = fadd float %202, %372
+ %374 = fadd float %373, 0xBFE0A3D700000000
+ %375 = fsub float -0.000000e+00, %202
+ %376 = fsub float -0.000000e+00, %343
+ %377 = fadd float %375, %376
+ %378 = insertelement <4 x float> undef, float %43, i32 0
+ %379 = insertelement <4 x float> %378, float %44, i32 1
+ %380 = insertelement <4 x float> %379, float %45, i32 2
+ %381 = insertelement <4 x float> %380, float 0.000000e+00, i32 3
+ %382 = insertelement <4 x float> undef, float %43, i32 0
+ %383 = insertelement <4 x float> %382, float %44, i32 1
+ %384 = insertelement <4 x float> %383, float %45, i32 2
+ %385 = insertelement <4 x float> %384, float 0.000000e+00, i32 3
+ %386 = call float @llvm.AMDGPU.dp4(<4 x float> %381, <4 x float> %385)
+ %387 = call float @llvm.AMDGPU.rsq.f32(float %386)
+ %388 = fmul float %45, %387
+ %389 = call float @fabs(float %388)
+ %390 = fmul float %176, 0x3FF51EB860000000
+ %391 = fadd float %390, %389
+ %392 = fadd float %391, 0xBFEFAE1480000000
+ %393 = fmul float %392, 0xC0490001A0000000
+ %394 = call float @llvm.AMDIL.clamp.(float %393, float 0.000000e+00, float 1.000000e+00)
+ %395 = fmul float 2.000000e+00, %368
+ %396 = fsub float -0.000000e+00, %395
+ %397 = fadd float 3.000000e+00, %396
+ %398 = fmul float %368, %397
+ %399 = fmul float %368, %398
+ %400 = call float @llvm.AMDGPU.lrp(float %399, float %255, float %345)
+ %401 = call float @llvm.AMDGPU.lrp(float %399, float %256, float %348)
+ %402 = call float @llvm.AMDGPU.lrp(float %399, float %257, float %351)
+ %403 = call float @llvm.AMDGPU.lrp(float %399, float %258, float 0.000000e+00)
+ %404 = fmul float 2.000000e+00, %394
+ %405 = fsub float -0.000000e+00, %404
+ %406 = fadd float 3.000000e+00, %405
+ %407 = fmul float %394, %406
+ %408 = fmul float %394, %407
+ %409 = call float @llvm.AMDGPU.lrp(float %408, float %255, float %371)
+ %410 = call float @llvm.AMDGPU.lrp(float %408, float %256, float %374)
+ %411 = call float @llvm.AMDGPU.lrp(float %408, float %257, float %377)
+ %412 = call float @llvm.AMDGPU.lrp(float %408, float %258, float 0x3FD3333340000000)
+ %413 = fcmp oge float 2.200000e+03, %179
+ %414 = sext i1 %413 to i32
+ %415 = bitcast i32 %414 to float
+ %416 = bitcast float %415 to i32
+ %417 = icmp ne i32 %416, 0
+ br i1 %417, label %IF161, label %ENDIF160
+
+LOOP: ; preds = %ENDIF139, %IF137
+ %temp88.0 = phi float [ 0.000000e+00, %IF137 ], [ %446, %ENDIF139 ]
+ %temp92.0 = phi float [ 1.000000e+00, %IF137 ], [ %.temp92.0, %ENDIF139 ]
+ %temp96.0 = phi float [ 0.000000e+00, %IF137 ], [ %477, %ENDIF139 ]
+ %418 = bitcast float %temp96.0 to i32
+ %419 = icmp sge i32 %418, %137
+ %420 = sext i1 %419 to i32
+ %421 = bitcast i32 %420 to float
+ %422 = bitcast float %421 to i32
+ %423 = icmp ne i32 %422, 0
+ br i1 %423, label %IF140, label %ENDIF139
+
+IF140: ; preds = %LOOP
+ %424 = fmul float %133, 5.000000e-01
+ %425 = fmul float %129, %temp92.0
+ %426 = fadd float %425, %22
+ %427 = fmul float %130, %temp92.0
+ %428 = fadd float %427, %23
+ %429 = insertelement <4 x float> undef, float %426, i32 0
+ %430 = insertelement <4 x float> %429, float %428, i32 1
+ %431 = insertelement <4 x float> %430, float 0.000000e+00, i32 2
+ %432 = insertelement <4 x float> %431, float 0.000000e+00, i32 3
+ %433 = extractelement <4 x float> %432, i32 0
+ %434 = extractelement <4 x float> %432, i32 1
+ %435 = insertelement <4 x float> undef, float %433, i32 0
+ %436 = insertelement <4 x float> %435, float %434, i32 1
+ %437 = insertelement <4 x float> %436, float undef, i32 2
+ %438 = insertelement <4 x float> %437, float undef, i32 3
+ %439 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %438, i32 20, i32 4, i32 2)
+ %440 = extractelement <4 x float> %439, i32 3
+ %441 = fcmp oge float %temp92.0, %440
+ %442 = sext i1 %441 to i32
+ %443 = bitcast i32 %442 to float
+ %444 = bitcast float %443 to i32
+ %445 = icmp ne i32 %444, 0
+ br i1 %445, label %IF146, label %ENDIF145
+
+ENDIF139: ; preds = %LOOP
+ %446 = fadd float %temp88.0, %133
+ %447 = fmul float %129, %446
+ %448 = fadd float %447, %22
+ %449 = fmul float %130, %446
+ %450 = fadd float %449, %23
+ %451 = insertelement <4 x float> undef, float %448, i32 0
+ %452 = insertelement <4 x float> %451, float %450, i32 1
+ %453 = insertelement <4 x float> %452, float 0.000000e+00, i32 2
+ %454 = insertelement <4 x float> %453, float 0.000000e+00, i32 3
+ %455 = extractelement <4 x float> %454, i32 0
+ %456 = extractelement <4 x float> %454, i32 1
+ %457 = insertelement <4 x float> undef, float %455, i32 0
+ %458 = insertelement <4 x float> %457, float %456, i32 1
+ %459 = insertelement <4 x float> %458, float undef, i32 2
+ %460 = insertelement <4 x float> %459, float undef, i32 3
+ %461 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %460, i32 20, i32 4, i32 2)
+ %462 = extractelement <4 x float> %461, i32 3
+ %463 = fcmp olt float 0x3FEFDF3B60000000, %temp92.0
+ %464 = sext i1 %463 to i32
+ %465 = bitcast i32 %464 to float
+ %466 = fcmp oge float %446, %462
+ %467 = sext i1 %466 to i32
+ %468 = bitcast i32 %467 to float
+ %469 = bitcast float %465 to i32
+ %470 = bitcast float %468 to i32
+ %471 = and i32 %469, %470
+ %472 = bitcast i32 %471 to float
+ %473 = bitcast float %472 to i32
+ %474 = icmp ne i32 %473, 0
+ %.temp92.0 = select i1 %474, float %446, float %temp92.0
+ %475 = bitcast float %temp96.0 to i32
+ %476 = add i32 %475, 1
+ %477 = bitcast i32 %476 to float
+ br label %LOOP
+
+IF146: ; preds = %IF140
+ %478 = fmul float 2.000000e+00, %424
+ %479 = fsub float -0.000000e+00, %478
+ %480 = fadd float %temp92.0, %479
+ br label %ENDIF145
+
+ENDIF145: ; preds = %IF140, %IF146
+ %temp88.1 = phi float [ %480, %IF146 ], [ %temp92.0, %IF140 ]
+ %481 = fadd float %temp88.1, %424
+ %482 = fmul float %424, 5.000000e-01
+ %483 = fmul float %129, %481
+ %484 = fadd float %483, %22
+ %485 = fmul float %130, %481
+ %486 = fadd float %485, %23
+ %487 = insertelement <4 x float> undef, float %484, i32 0
+ %488 = insertelement <4 x float> %487, float %486, i32 1
+ %489 = insertelement <4 x float> %488, float 0.000000e+00, i32 2
+ %490 = insertelement <4 x float> %489, float %440, i32 3
+ %491 = extractelement <4 x float> %490, i32 0
+ %492 = extractelement <4 x float> %490, i32 1
+ %493 = insertelement <4 x float> undef, float %491, i32 0
+ %494 = insertelement <4 x float> %493, float %492, i32 1
+ %495 = insertelement <4 x float> %494, float undef, i32 2
+ %496 = insertelement <4 x float> %495, float undef, i32 3
+ %497 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %496, i32 20, i32 4, i32 2)
+ %498 = extractelement <4 x float> %497, i32 3
+ %499 = fcmp oge float %481, %498
+ %500 = sext i1 %499 to i32
+ %501 = bitcast i32 %500 to float
+ %502 = bitcast float %501 to i32
+ %503 = icmp ne i32 %502, 0
+ br i1 %503, label %IF149, label %ENDIF148
+
+IF149: ; preds = %ENDIF145
+ %504 = fmul float 2.000000e+00, %482
+ %505 = fsub float -0.000000e+00, %504
+ %506 = fadd float %481, %505
+ br label %ENDIF148
+
+ENDIF148: ; preds = %ENDIF145, %IF149
+ %temp88.2 = phi float [ %506, %IF149 ], [ %481, %ENDIF145 ]
+ %temp92.2 = phi float [ %481, %IF149 ], [ %temp92.0, %ENDIF145 ]
+ %507 = fadd float %temp88.2, %482
+ %508 = fmul float %482, 5.000000e-01
+ %509 = fmul float %129, %507
+ %510 = fadd float %509, %22
+ %511 = fmul float %130, %507
+ %512 = fadd float %511, %23
+ %513 = insertelement <4 x float> undef, float %510, i32 0
+ %514 = insertelement <4 x float> %513, float %512, i32 1
+ %515 = insertelement <4 x float> %514, float 0.000000e+00, i32 2
+ %516 = insertelement <4 x float> %515, float %498, i32 3
+ %517 = extractelement <4 x float> %516, i32 0
+ %518 = extractelement <4 x float> %516, i32 1
+ %519 = insertelement <4 x float> undef, float %517, i32 0
+ %520 = insertelement <4 x float> %519, float %518, i32 1
+ %521 = insertelement <4 x float> %520, float undef, i32 2
+ %522 = insertelement <4 x float> %521, float undef, i32 3
+ %523 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %522, i32 20, i32 4, i32 2)
+ %524 = extractelement <4 x float> %523, i32 3
+ %525 = fcmp oge float %507, %524
+ %526 = sext i1 %525 to i32
+ %527 = bitcast i32 %526 to float
+ %528 = bitcast float %527 to i32
+ %529 = icmp ne i32 %528, 0
+ br i1 %529, label %IF152, label %ENDIF151
+
+IF152: ; preds = %ENDIF148
+ %530 = fmul float 2.000000e+00, %508
+ %531 = fsub float -0.000000e+00, %530
+ %532 = fadd float %507, %531
+ br label %ENDIF151
+
+ENDIF151: ; preds = %ENDIF148, %IF152
+ %temp88.3 = phi float [ %532, %IF152 ], [ %507, %ENDIF148 ]
+ %temp92.3 = phi float [ %507, %IF152 ], [ %temp92.2, %ENDIF148 ]
+ %533 = fadd float %temp88.3, %508
+ %534 = fmul float %508, 5.000000e-01
+ %535 = fmul float %129, %533
+ %536 = fadd float %535, %22
+ %537 = fmul float %130, %533
+ %538 = fadd float %537, %23
+ %539 = insertelement <4 x float> undef, float %536, i32 0
+ %540 = insertelement <4 x float> %539, float %538, i32 1
+ %541 = insertelement <4 x float> %540, float 0.000000e+00, i32 2
+ %542 = insertelement <4 x float> %541, float %524, i32 3
+ %543 = extractelement <4 x float> %542, i32 0
+ %544 = extractelement <4 x float> %542, i32 1
+ %545 = insertelement <4 x float> undef, float %543, i32 0
+ %546 = insertelement <4 x float> %545, float %544, i32 1
+ %547 = insertelement <4 x float> %546, float undef, i32 2
+ %548 = insertelement <4 x float> %547, float undef, i32 3
+ %549 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %548, i32 20, i32 4, i32 2)
+ %550 = extractelement <4 x float> %549, i32 3
+ %551 = fcmp oge float %533, %550
+ %552 = sext i1 %551 to i32
+ %553 = bitcast i32 %552 to float
+ %554 = bitcast float %553 to i32
+ %555 = icmp ne i32 %554, 0
+ br i1 %555, label %IF155, label %ENDIF154
+
+IF155: ; preds = %ENDIF151
+ %556 = fmul float 2.000000e+00, %534
+ %557 = fsub float -0.000000e+00, %556
+ %558 = fadd float %533, %557
+ br label %ENDIF154
+
+ENDIF154: ; preds = %ENDIF151, %IF155
+ %temp88.4 = phi float [ %558, %IF155 ], [ %533, %ENDIF151 ]
+ %temp92.4 = phi float [ %533, %IF155 ], [ %temp92.3, %ENDIF151 ]
+ %559 = fadd float %temp88.4, %534
+ %560 = fmul float %129, %559
+ %561 = fadd float %560, %22
+ %562 = fmul float %130, %559
+ %563 = fadd float %562, %23
+ %564 = insertelement <4 x float> undef, float %561, i32 0
+ %565 = insertelement <4 x float> %564, float %563, i32 1
+ %566 = insertelement <4 x float> %565, float 0.000000e+00, i32 2
+ %567 = insertelement <4 x float> %566, float %550, i32 3
+ %568 = extractelement <4 x float> %567, i32 0
+ %569 = extractelement <4 x float> %567, i32 1
+ %570 = insertelement <4 x float> undef, float %568, i32 0
+ %571 = insertelement <4 x float> %570, float %569, i32 1
+ %572 = insertelement <4 x float> %571, float undef, i32 2
+ %573 = insertelement <4 x float> %572, float undef, i32 3
+ %574 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %573, i32 20, i32 4, i32 2)
+ %575 = extractelement <4 x float> %574, i32 3
+ %576 = fcmp oge float %559, %575
+ %577 = sext i1 %576 to i32
+ %578 = bitcast i32 %577 to float
+ %579 = bitcast float %578 to i32
+ %580 = icmp ne i32 %579, 0
+ %.temp92.4 = select i1 %580, float %559, float %temp92.4
+ %581 = fmul float %129, %.temp92.4
+ %582 = fadd float %581, %22
+ %583 = fmul float %130, %.temp92.4
+ %584 = fadd float %583, %23
+ %585 = insertelement <4 x float> undef, float %582, i32 0
+ %586 = insertelement <4 x float> %585, float %584, i32 1
+ %587 = insertelement <4 x float> %586, float 0.000000e+00, i32 2
+ %588 = insertelement <4 x float> %587, float %575, i32 3
+ %589 = extractelement <4 x float> %588, i32 0
+ %590 = extractelement <4 x float> %588, i32 1
+ %591 = insertelement <4 x float> undef, float %589, i32 0
+ %592 = insertelement <4 x float> %591, float %590, i32 1
+ %593 = insertelement <4 x float> %592, float undef, i32 2
+ %594 = insertelement <4 x float> %593, float undef, i32 3
+ %595 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %594, i32 20, i32 4, i32 2)
+ %596 = extractelement <4 x float> %595, i32 0
+ %597 = extractelement <4 x float> %595, i32 1
+ %598 = extractelement <4 x float> %595, i32 2
+ %599 = fmul float %596, 2.000000e+00
+ %600 = fadd float %599, -1.000000e+00
+ %601 = fmul float %597, 2.000000e+00
+ %602 = fadd float %601, -1.000000e+00
+ %603 = fmul float %598, 2.000000e+00
+ %604 = fadd float %603, -1.000000e+00
+ br label %ENDIF136
+
+IF161: ; preds = %ENDIF136
+ %605 = fmul float %202, 0x3FB99999A0000000
+ %606 = fcmp uge float 0x3FE4CCCCC0000000, %605
+ %607 = select i1 %606, float 0x3FE4CCCCC0000000, float %605
+ %608 = fcmp uge float %607, 5.000000e-01
+ %609 = select i1 %608, float 5.000000e-01, float %607
+ %610 = call float @llvm.AMDGPU.lrp(float %609, float %400, float %300)
+ %611 = call float @llvm.AMDGPU.lrp(float %609, float %401, float %301)
+ %612 = call float @llvm.AMDGPU.lrp(float %609, float %402, float %302)
+ %613 = call float @llvm.AMDGPU.lrp(float %609, float %403, float %303)
+ %614 = insertelement <4 x float> undef, float %329, i32 0
+ %615 = insertelement <4 x float> %614, float %330, i32 1
+ %616 = insertelement <4 x float> %615, float %331, i32 2
+ %617 = insertelement <4 x float> %616, float 0.000000e+00, i32 3
+ %618 = insertelement <4 x float> undef, float %63, i32 0
+ %619 = insertelement <4 x float> %618, float %65, i32 1
+ %620 = insertelement <4 x float> %619, float %67, i32 2
+ %621 = insertelement <4 x float> %620, float 0.000000e+00, i32 3
+ %622 = call float @llvm.AMDGPU.dp4(<4 x float> %617, <4 x float> %621)
+ %623 = fcmp uge float 0x3FE6666660000000, %622
+ %624 = select i1 %623, float 0x3FE6666660000000, float %622
+ %625 = fmul float %8, %624
+ %626 = fmul float %13, %624
+ %627 = fmul float %18, %624
+ %628 = insertelement <4 x float> undef, float %34, i32 0
+ %629 = insertelement <4 x float> %628, float %35, i32 1
+ %630 = insertelement <4 x float> %629, float %36, i32 2
+ %631 = insertelement <4 x float> %630, float 0.000000e+00, i32 3
+ %632 = insertelement <4 x float> undef, float %63, i32 0
+ %633 = insertelement <4 x float> %632, float %65, i32 1
+ %634 = insertelement <4 x float> %633, float %67, i32 2
+ %635 = insertelement <4 x float> %634, float 0.000000e+00, i32 3
+ %636 = call float @llvm.AMDGPU.dp4(<4 x float> %631, <4 x float> %635)
+ %637 = fcmp uge float 0x3FECCCCCC0000000, %636
+ %638 = select i1 %637, float 0x3FECCCCCC0000000, float %636
+ %639 = fmul float %625, %638
+ %640 = fmul float %626, %638
+ %641 = fmul float %627, %638
+ br label %ENDIF160
+
+ENDIF160: ; preds = %ENDIF136, %IF161
+ %temp84.0 = phi float [ %610, %IF161 ], [ %255, %ENDIF136 ]
+ %temp85.0 = phi float [ %611, %IF161 ], [ %256, %ENDIF136 ]
+ %temp86.0 = phi float [ %612, %IF161 ], [ %257, %ENDIF136 ]
+ %temp87.0 = phi float [ %613, %IF161 ], [ %258, %ENDIF136 ]
+ %temp92.6 = phi float [ %639, %IF161 ], [ %415, %ENDIF136 ]
+ %temp93.0 = phi float [ %640, %IF161 ], [ 0.000000e+00, %ENDIF136 ]
+ %temp94.0 = phi float [ %641, %IF161 ], [ 0.000000e+00, %ENDIF136 ]
+ %642 = fcmp olt float 2.200000e+03, %179
+ %643 = sext i1 %642 to i32
+ %644 = bitcast i32 %643 to float
+ %645 = fcmp olt float %179, 2.300000e+03
+ %646 = sext i1 %645 to i32
+ %647 = bitcast i32 %646 to float
+ %648 = bitcast float %644 to i32
+ %649 = bitcast float %647 to i32
+ %650 = and i32 %648, %649
+ %651 = bitcast i32 %650 to float
+ %652 = bitcast float %651 to i32
+ %653 = icmp ne i32 %652, 0
+ br i1 %653, label %IF164, label %ENDIF163
+
+IF164: ; preds = %ENDIF160
+ %654 = fmul float %202, 5.000000e-01
+ %655 = fcmp uge float 0x3FE4CCCCC0000000, %654
+ %656 = select i1 %655, float 0x3FE4CCCCC0000000, float %654
+ %657 = fcmp uge float %656, 0x3FD6666660000000
+ %658 = select i1 %657, float 0x3FD6666660000000, float %656
+ %659 = call float @llvm.AMDGPU.lrp(float %658, float %400, float %300)
+ %660 = call float @llvm.AMDGPU.lrp(float %658, float %401, float %301)
+ %661 = call float @llvm.AMDGPU.lrp(float %658, float %402, float %302)
+ %662 = call float @llvm.AMDGPU.lrp(float %658, float %403, float %303)
+ %663 = insertelement <4 x float> undef, float %329, i32 0
+ %664 = insertelement <4 x float> %663, float %330, i32 1
+ %665 = insertelement <4 x float> %664, float %331, i32 2
+ %666 = insertelement <4 x float> %665, float 0.000000e+00, i32 3
+ %667 = insertelement <4 x float> undef, float %63, i32 0
+ %668 = insertelement <4 x float> %667, float %65, i32 1
+ %669 = insertelement <4 x float> %668, float %67, i32 2
+ %670 = insertelement <4 x float> %669, float 0.000000e+00, i32 3
+ %671 = call float @llvm.AMDGPU.dp4(<4 x float> %666, <4 x float> %670)
+ %672 = fcmp uge float 0x3FE6666660000000, %671
+ %673 = select i1 %672, float 0x3FE6666660000000, float %671
+ %674 = fmul float %8, %673
+ %675 = fmul float %13, %673
+ %676 = fmul float %18, %673
+ %677 = insertelement <4 x float> undef, float %34, i32 0
+ %678 = insertelement <4 x float> %677, float %35, i32 1
+ %679 = insertelement <4 x float> %678, float %36, i32 2
+ %680 = insertelement <4 x float> %679, float 0.000000e+00, i32 3
+ %681 = insertelement <4 x float> undef, float %63, i32 0
+ %682 = insertelement <4 x float> %681, float %65, i32 1
+ %683 = insertelement <4 x float> %682, float %67, i32 2
+ %684 = insertelement <4 x float> %683, float 0.000000e+00, i32 3
+ %685 = call float @llvm.AMDGPU.dp4(<4 x float> %680, <4 x float> %684)
+ %686 = fcmp uge float 0x3FECCCCCC0000000, %685
+ %687 = select i1 %686, float 0x3FECCCCCC0000000, float %685
+ %688 = fmul float %674, %687
+ %689 = fmul float %675, %687
+ %690 = fmul float %676, %687
+ br label %ENDIF163
+
+ENDIF163: ; preds = %ENDIF160, %IF164
+ %temp84.1 = phi float [ %659, %IF164 ], [ %temp84.0, %ENDIF160 ]
+ %temp85.1 = phi float [ %660, %IF164 ], [ %temp85.0, %ENDIF160 ]
+ %temp86.1 = phi float [ %661, %IF164 ], [ %temp86.0, %ENDIF160 ]
+ %temp87.1 = phi float [ %662, %IF164 ], [ %temp87.0, %ENDIF160 ]
+ %temp92.7 = phi float [ %688, %IF164 ], [ %temp92.6, %ENDIF160 ]
+ %temp93.1 = phi float [ %689, %IF164 ], [ %temp93.0, %ENDIF160 ]
+ %temp94.1 = phi float [ %690, %IF164 ], [ %temp94.0, %ENDIF160 ]
+ %691 = fcmp oge float %179, 2.300000e+03
+ %692 = sext i1 %691 to i32
+ %693 = bitcast i32 %692 to float
+ %694 = fcmp olt float %179, 2.480000e+03
+ %695 = sext i1 %694 to i32
+ %696 = bitcast i32 %695 to float
+ %697 = bitcast float %693 to i32
+ %698 = bitcast float %696 to i32
+ %699 = and i32 %697, %698
+ %700 = bitcast i32 %699 to float
+ %701 = bitcast float %700 to i32
+ %702 = icmp ne i32 %701, 0
+ br i1 %702, label %IF167, label %ENDIF166
+
+IF167: ; preds = %ENDIF163
+ %703 = fmul float %202, 5.000000e-01
+ %704 = fcmp uge float 0x3FE4CCCCC0000000, %703
+ %705 = select i1 %704, float 0x3FE4CCCCC0000000, float %703
+ %706 = fcmp uge float %705, 0x3FD3333340000000
+ %707 = select i1 %706, float 0x3FD3333340000000, float %705
+ %708 = call float @llvm.AMDGPU.lrp(float %707, float %409, float %300)
+ %709 = call float @llvm.AMDGPU.lrp(float %707, float %410, float %301)
+ %710 = call float @llvm.AMDGPU.lrp(float %707, float %411, float %302)
+ %711 = call float @llvm.AMDGPU.lrp(float %707, float %412, float %303)
+ %712 = insertelement <4 x float> undef, float %329, i32 0
+ %713 = insertelement <4 x float> %712, float %330, i32 1
+ %714 = insertelement <4 x float> %713, float %331, i32 2
+ %715 = insertelement <4 x float> %714, float 0.000000e+00, i32 3
+ %716 = insertelement <4 x float> undef, float %63, i32 0
+ %717 = insertelement <4 x float> %716, float %65, i32 1
+ %718 = insertelement <4 x float> %717, float %67, i32 2
+ %719 = insertelement <4 x float> %718, float 0.000000e+00, i32 3
+ %720 = call float @llvm.AMDGPU.dp4(<4 x float> %715, <4 x float> %719)
+ %721 = fcmp uge float 0x3FEB333340000000, %720
+ %722 = select i1 %721, float 0x3FEB333340000000, float %720
+ %723 = fmul float %8, %722
+ %724 = fmul float %13, %722
+ %725 = fmul float %18, %722
+ %726 = insertelement <4 x float> undef, float %34, i32 0
+ %727 = insertelement <4 x float> %726, float %35, i32 1
+ %728 = insertelement <4 x float> %727, float %36, i32 2
+ %729 = insertelement <4 x float> %728, float 0.000000e+00, i32 3
+ %730 = insertelement <4 x float> undef, float %63, i32 0
+ %731 = insertelement <4 x float> %730, float %65, i32 1
+ %732 = insertelement <4 x float> %731, float %67, i32 2
+ %733 = insertelement <4 x float> %732, float 0.000000e+00, i32 3
+ %734 = call float @llvm.AMDGPU.dp4(<4 x float> %729, <4 x float> %733)
+ %735 = fcmp uge float 0x3FECCCCCC0000000, %734
+ %736 = select i1 %735, float 0x3FECCCCCC0000000, float %734
+ %737 = fmul float %723, %736
+ %738 = fmul float %724, %736
+ %739 = fmul float %725, %736
+ br label %ENDIF166
+
+ENDIF166: ; preds = %ENDIF163, %IF167
+ %temp84.2 = phi float [ %708, %IF167 ], [ %temp84.1, %ENDIF163 ]
+ %temp85.2 = phi float [ %709, %IF167 ], [ %temp85.1, %ENDIF163 ]
+ %temp86.2 = phi float [ %710, %IF167 ], [ %temp86.1, %ENDIF163 ]
+ %temp87.2 = phi float [ %711, %IF167 ], [ %temp87.1, %ENDIF163 ]
+ %temp92.8 = phi float [ %737, %IF167 ], [ %temp92.7, %ENDIF163 ]
+ %temp93.2 = phi float [ %738, %IF167 ], [ %temp93.1, %ENDIF163 ]
+ %temp94.2 = phi float [ %739, %IF167 ], [ %temp94.1, %ENDIF163 ]
+ %740 = fcmp oge float %179, 2.480000e+03
+ %741 = sext i1 %740 to i32
+ %742 = bitcast i32 %741 to float
+ %743 = fcmp olt float %179, 2.530000e+03
+ %744 = sext i1 %743 to i32
+ %745 = bitcast i32 %744 to float
+ %746 = bitcast float %742 to i32
+ %747 = bitcast float %745 to i32
+ %748 = and i32 %746, %747
+ %749 = bitcast i32 %748 to float
+ %750 = bitcast float %749 to i32
+ %751 = icmp ne i32 %750, 0
+ br i1 %751, label %IF170, label %ENDIF169
+
+IF170: ; preds = %ENDIF166
+ %752 = fmul float %202, 5.000000e-01
+ %753 = fcmp uge float 0x3FE4CCCCC0000000, %752
+ %754 = select i1 %753, float 0x3FE4CCCCC0000000, float %752
+ %755 = fcmp uge float %754, 0x3FC99999A0000000
+ %756 = select i1 %755, float 0x3FC99999A0000000, float %754
+ %757 = call float @llvm.AMDGPU.lrp(float %756, float %409, float %300)
+ %758 = call float @llvm.AMDGPU.lrp(float %756, float %410, float %301)
+ %759 = call float @llvm.AMDGPU.lrp(float %756, float %411, float %302)
+ %760 = call float @llvm.AMDGPU.lrp(float %756, float %412, float %303)
+ %761 = insertelement <4 x float> undef, float %329, i32 0
+ %762 = insertelement <4 x float> %761, float %330, i32 1
+ %763 = insertelement <4 x float> %762, float %331, i32 2
+ %764 = insertelement <4 x float> %763, float 0.000000e+00, i32 3
+ %765 = insertelement <4 x float> undef, float %63, i32 0
+ %766 = insertelement <4 x float> %765, float %65, i32 1
+ %767 = insertelement <4 x float> %766, float %67, i32 2
+ %768 = insertelement <4 x float> %767, float 0.000000e+00, i32 3
+ %769 = call float @llvm.AMDGPU.dp4(<4 x float> %764, <4 x float> %768)
+ %770 = fcmp uge float 0x3FEB333340000000, %769
+ %771 = select i1 %770, float 0x3FEB333340000000, float %769
+ %772 = fmul float %8, %771
+ %773 = fmul float %13, %771
+ %774 = fmul float %18, %771
+ %775 = insertelement <4 x float> undef, float %34, i32 0
+ %776 = insertelement <4 x float> %775, float %35, i32 1
+ %777 = insertelement <4 x float> %776, float %36, i32 2
+ %778 = insertelement <4 x float> %777, float 0.000000e+00, i32 3
+ %779 = insertelement <4 x float> undef, float %63, i32 0
+ %780 = insertelement <4 x float> %779, float %65, i32 1
+ %781 = insertelement <4 x float> %780, float %67, i32 2
+ %782 = insertelement <4 x float> %781, float 0.000000e+00, i32 3
+ %783 = call float @llvm.AMDGPU.dp4(<4 x float> %778, <4 x float> %782)
+ %784 = fcmp uge float 0x3FECCCCCC0000000, %783
+ %785 = select i1 %784, float 0x3FECCCCCC0000000, float %783
+ %786 = fmul float %772, %785
+ %787 = fmul float %773, %785
+ %788 = fmul float %774, %785
+ br label %ENDIF169
+
+ENDIF169: ; preds = %ENDIF166, %IF170
+ %temp84.3 = phi float [ %757, %IF170 ], [ %temp84.2, %ENDIF166 ]
+ %temp85.3 = phi float [ %758, %IF170 ], [ %temp85.2, %ENDIF166 ]
+ %temp86.3 = phi float [ %759, %IF170 ], [ %temp86.2, %ENDIF166 ]
+ %temp87.3 = phi float [ %760, %IF170 ], [ %temp87.2, %ENDIF166 ]
+ %temp92.9 = phi float [ %786, %IF170 ], [ %temp92.8, %ENDIF166 ]
+ %temp93.3 = phi float [ %787, %IF170 ], [ %temp93.2, %ENDIF166 ]
+ %temp94.3 = phi float [ %788, %IF170 ], [ %temp94.2, %ENDIF166 ]
+ %789 = fcmp oge float %179, 2.530000e+03
+ %790 = sext i1 %789 to i32
+ %791 = bitcast i32 %790 to float
+ %792 = fcmp olt float %179, 2.670000e+03
+ %793 = sext i1 %792 to i32
+ %794 = bitcast i32 %793 to float
+ %795 = bitcast float %791 to i32
+ %796 = bitcast float %794 to i32
+ %797 = and i32 %795, %796
+ %798 = bitcast i32 %797 to float
+ %799 = bitcast float %798 to i32
+ %800 = icmp ne i32 %799, 0
+ br i1 %800, label %IF173, label %ENDIF172
+
+IF173: ; preds = %ENDIF169
+ %801 = fmul float %202, 5.000000e-01
+ %802 = fcmp uge float 0x3FE4CCCCC0000000, %801
+ %803 = select i1 %802, float 0x3FE4CCCCC0000000, float %801
+ %804 = fcmp uge float %803, 0x3FB99999A0000000
+ %805 = select i1 %804, float 0x3FB99999A0000000, float %803
+ %806 = call float @llvm.AMDGPU.lrp(float %805, float %400, float %300)
+ %807 = call float @llvm.AMDGPU.lrp(float %805, float %401, float %301)
+ %808 = call float @llvm.AMDGPU.lrp(float %805, float %402, float %302)
+ %809 = call float @llvm.AMDGPU.lrp(float %805, float %403, float %303)
+ %810 = insertelement <4 x float> undef, float %329, i32 0
+ %811 = insertelement <4 x float> %810, float %330, i32 1
+ %812 = insertelement <4 x float> %811, float %331, i32 2
+ %813 = insertelement <4 x float> %812, float 0.000000e+00, i32 3
+ %814 = insertelement <4 x float> undef, float %63, i32 0
+ %815 = insertelement <4 x float> %814, float %65, i32 1
+ %816 = insertelement <4 x float> %815, float %67, i32 2
+ %817 = insertelement <4 x float> %816, float 0.000000e+00, i32 3
+ %818 = call float @llvm.AMDGPU.dp4(<4 x float> %813, <4 x float> %817)
+ %819 = fcmp uge float 0x3FEB333340000000, %818
+ %820 = select i1 %819, float 0x3FEB333340000000, float %818
+ %821 = fmul float %8, %820
+ %822 = fmul float %13, %820
+ %823 = fmul float %18, %820
+ %824 = insertelement <4 x float> undef, float %34, i32 0
+ %825 = insertelement <4 x float> %824, float %35, i32 1
+ %826 = insertelement <4 x float> %825, float %36, i32 2
+ %827 = insertelement <4 x float> %826, float 0.000000e+00, i32 3
+ %828 = insertelement <4 x float> undef, float %63, i32 0
+ %829 = insertelement <4 x float> %828, float %65, i32 1
+ %830 = insertelement <4 x float> %829, float %67, i32 2
+ %831 = insertelement <4 x float> %830, float 0.000000e+00, i32 3
+ %832 = call float @llvm.AMDGPU.dp4(<4 x float> %827, <4 x float> %831)
+ %833 = fcmp uge float 0x3FECCCCCC0000000, %832
+ %834 = select i1 %833, float 0x3FECCCCCC0000000, float %832
+ %835 = fmul float %821, %834
+ %836 = fmul float %822, %834
+ %837 = fmul float %823, %834
+ br label %ENDIF172
+
+ENDIF172: ; preds = %ENDIF169, %IF173
+ %temp84.4 = phi float [ %806, %IF173 ], [ %temp84.3, %ENDIF169 ]
+ %temp85.4 = phi float [ %807, %IF173 ], [ %temp85.3, %ENDIF169 ]
+ %temp86.4 = phi float [ %808, %IF173 ], [ %temp86.3, %ENDIF169 ]
+ %temp87.4 = phi float [ %809, %IF173 ], [ %temp87.3, %ENDIF169 ]
+ %temp92.10 = phi float [ %835, %IF173 ], [ %temp92.9, %ENDIF169 ]
+ %temp93.4 = phi float [ %836, %IF173 ], [ %temp93.3, %ENDIF169 ]
+ %temp94.4 = phi float [ %837, %IF173 ], [ %temp94.3, %ENDIF169 ]
+ %838 = fcmp oge float %179, 2.670000e+03
+ %839 = sext i1 %838 to i32
+ %840 = bitcast i32 %839 to float
+ %841 = bitcast float %840 to i32
+ %842 = icmp ne i32 %841, 0
+ br i1 %842, label %IF176, label %ENDIF175
+
+IF176: ; preds = %ENDIF172
+ %843 = fmul float %202, 0x3FB99999A0000000
+ %844 = fcmp uge float 0.000000e+00, %843
+ %845 = select i1 %844, float 0.000000e+00, float %843
+ %846 = fcmp uge float %845, 0x3FD99999A0000000
+ %847 = select i1 %846, float 0x3FD99999A0000000, float %845
+ %848 = call float @llvm.AMDGPU.lrp(float %847, float %400, float %300)
+ %849 = call float @llvm.AMDGPU.lrp(float %847, float %401, float %301)
+ %850 = call float @llvm.AMDGPU.lrp(float %847, float %402, float %302)
+ %851 = call float @llvm.AMDGPU.lrp(float %847, float %403, float %303)
+ %852 = insertelement <4 x float> undef, float %329, i32 0
+ %853 = insertelement <4 x float> %852, float %330, i32 1
+ %854 = insertelement <4 x float> %853, float %331, i32 2
+ %855 = insertelement <4 x float> %854, float 0.000000e+00, i32 3
+ %856 = insertelement <4 x float> undef, float %63, i32 0
+ %857 = insertelement <4 x float> %856, float %65, i32 1
+ %858 = insertelement <4 x float> %857, float %67, i32 2
+ %859 = insertelement <4 x float> %858, float 0.000000e+00, i32 3
+ %860 = call float @llvm.AMDGPU.dp4(<4 x float> %855, <4 x float> %859)
+ %861 = fcmp uge float 0x3FEB333340000000, %860
+ %862 = select i1 %861, float 0x3FEB333340000000, float %860
+ %863 = fmul float %8, %862
+ %864 = fmul float %13, %862
+ %865 = fmul float %18, %862
+ %866 = insertelement <4 x float> undef, float %34, i32 0
+ %867 = insertelement <4 x float> %866, float %35, i32 1
+ %868 = insertelement <4 x float> %867, float %36, i32 2
+ %869 = insertelement <4 x float> %868, float 0.000000e+00, i32 3
+ %870 = insertelement <4 x float> undef, float %63, i32 0
+ %871 = insertelement <4 x float> %870, float %65, i32 1
+ %872 = insertelement <4 x float> %871, float %67, i32 2
+ %873 = insertelement <4 x float> %872, float 0.000000e+00, i32 3
+ %874 = call float @llvm.AMDGPU.dp4(<4 x float> %869, <4 x float> %873)
+ %875 = fcmp uge float 0x3FECCCCCC0000000, %874
+ %876 = select i1 %875, float 0x3FECCCCCC0000000, float %874
+ %877 = fmul float %863, %876
+ %878 = fmul float %864, %876
+ %879 = fmul float %865, %876
+ br label %ENDIF175
+
+ENDIF175: ; preds = %ENDIF172, %IF176
+ %temp84.5 = phi float [ %848, %IF176 ], [ %temp84.4, %ENDIF172 ]
+ %temp85.5 = phi float [ %849, %IF176 ], [ %temp85.4, %ENDIF172 ]
+ %temp86.5 = phi float [ %850, %IF176 ], [ %temp86.4, %ENDIF172 ]
+ %temp87.5 = phi float [ %851, %IF176 ], [ %temp87.4, %ENDIF172 ]
+ %temp92.11 = phi float [ %877, %IF176 ], [ %temp92.10, %ENDIF172 ]
+ %temp93.5 = phi float [ %878, %IF176 ], [ %temp93.4, %ENDIF172 ]
+ %temp94.5 = phi float [ %879, %IF176 ], [ %temp94.4, %ENDIF172 ]
+ %880 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10)
+ %881 = extractelement <4 x float> %880, i32 0
+ %882 = fcmp olt float %881, %179
+ %883 = sext i1 %882 to i32
+ %884 = bitcast i32 %883 to float
+ %885 = bitcast float %884 to i32
+ %886 = icmp ne i32 %885, 0
+ br i1 %886, label %IF179, label %ENDIF178
+
+IF179: ; preds = %ENDIF175
+ %887 = fadd float %202, 1.000000e+00
+ %888 = fadd float %202, 1.000000e+00
+ %889 = fadd float %202, 1.000000e+00
+ %890 = insertelement <4 x float> undef, float %43, i32 0
+ %891 = insertelement <4 x float> %890, float %44, i32 1
+ %892 = insertelement <4 x float> %891, float %45, i32 2
+ %893 = insertelement <4 x float> %892, float 0.000000e+00, i32 3
+ %894 = insertelement <4 x float> undef, float %43, i32 0
+ %895 = insertelement <4 x float> %894, float %44, i32 1
+ %896 = insertelement <4 x float> %895, float %45, i32 2
+ %897 = insertelement <4 x float> %896, float 0.000000e+00, i32 3
+ %898 = call float @llvm.AMDGPU.dp4(<4 x float> %893, <4 x float> %897)
+ %899 = call float @llvm.AMDGPU.rsq.f32(float %898)
+ %900 = fmul float %45, %899
+ %901 = call float @fabs(float %900)
+ %902 = fmul float %176, 0x3FECCCCCC0000000
+ %903 = fadd float %902, %901
+ %904 = fadd float %903, 0xBFEFAE1480000000
+ %905 = fmul float %904, 0xC043FFFE20000000
+ %906 = call float @llvm.AMDIL.clamp.(float %905, float 0.000000e+00, float 1.000000e+00)
+ %907 = fmul float 2.000000e+00, %906
+ %908 = fsub float -0.000000e+00, %907
+ %909 = fadd float 3.000000e+00, %908
+ %910 = fmul float %906, %909
+ %911 = fmul float %906, %910
+ %912 = call float @llvm.AMDGPU.lrp(float %911, float %temp84.5, float %887)
+ %913 = call float @llvm.AMDGPU.lrp(float %911, float %temp85.5, float %888)
+ %914 = call float @llvm.AMDGPU.lrp(float %911, float %temp86.5, float %889)
+ %915 = call float @llvm.AMDGPU.lrp(float %911, float %temp87.5, float 0.000000e+00)
+ %916 = fmul float %202, 5.000000e-01
+ %917 = fcmp uge float 0x3FE4CCCCC0000000, %916
+ %918 = select i1 %917, float 0x3FE4CCCCC0000000, float %916
+ %919 = fcmp uge float %918, 0x3FE3333340000000
+ %920 = select i1 %919, float 0x3FE3333340000000, float %918
+ %921 = call float @llvm.AMDGPU.lrp(float %920, float %912, float %temp84.5)
+ %922 = call float @llvm.AMDGPU.lrp(float %920, float %913, float %temp85.5)
+ %923 = call float @llvm.AMDGPU.lrp(float %920, float %914, float %temp86.5)
+ %924 = call float @llvm.AMDGPU.lrp(float %920, float %915, float %temp87.5)
+ %925 = insertelement <4 x float> undef, float %329, i32 0
+ %926 = insertelement <4 x float> %925, float %330, i32 1
+ %927 = insertelement <4 x float> %926, float %331, i32 2
+ %928 = insertelement <4 x float> %927, float 0.000000e+00, i32 3
+ %929 = insertelement <4 x float> undef, float %63, i32 0
+ %930 = insertelement <4 x float> %929, float %65, i32 1
+ %931 = insertelement <4 x float> %930, float %67, i32 2
+ %932 = insertelement <4 x float> %931, float 0.000000e+00, i32 3
+ %933 = call float @llvm.AMDGPU.dp4(<4 x float> %928, <4 x float> %932)
+ %934 = fcmp uge float 0x3FE99999A0000000, %933
+ %935 = select i1 %934, float 0x3FE99999A0000000, float %933
+ %936 = fmul float %8, %935
+ %937 = fmul float %13, %935
+ %938 = fmul float %18, %935
+ %939 = insertelement <4 x float> undef, float %34, i32 0
+ %940 = insertelement <4 x float> %939, float %35, i32 1
+ %941 = insertelement <4 x float> %940, float %36, i32 2
+ %942 = insertelement <4 x float> %941, float 0.000000e+00, i32 3
+ %943 = insertelement <4 x float> undef, float %63, i32 0
+ %944 = insertelement <4 x float> %943, float %65, i32 1
+ %945 = insertelement <4 x float> %944, float %67, i32 2
+ %946 = insertelement <4 x float> %945, float 0.000000e+00, i32 3
+ %947 = call float @llvm.AMDGPU.dp4(<4 x float> %942, <4 x float> %946)
+ %948 = fcmp uge float 0x3FECCCCCC0000000, %947
+ %949 = select i1 %948, float 0x3FECCCCCC0000000, float %947
+ %950 = fmul float %936, %949
+ %951 = fmul float %937, %949
+ %952 = fmul float %938, %949
+ br label %ENDIF178
+
+ENDIF178: ; preds = %ENDIF175, %IF179
+ %temp84.6 = phi float [ %921, %IF179 ], [ %temp84.5, %ENDIF175 ]
+ %temp85.6 = phi float [ %922, %IF179 ], [ %temp85.5, %ENDIF175 ]
+ %temp86.6 = phi float [ %923, %IF179 ], [ %temp86.5, %ENDIF175 ]
+ %temp87.6 = phi float [ %924, %IF179 ], [ %temp87.5, %ENDIF175 ]
+ %temp92.12 = phi float [ %950, %IF179 ], [ %temp92.11, %ENDIF175 ]
+ %temp93.6 = phi float [ %951, %IF179 ], [ %temp93.5, %ENDIF175 ]
+ %temp94.6 = phi float [ %952, %IF179 ], [ %temp94.5, %ENDIF175 ]
+ %953 = fmul float %55, %temp92.12
+ %954 = fmul float %57, %temp93.6
+ %955 = fmul float %59, %temp94.6
+ %956 = fmul float %61, 0.000000e+00
+ %957 = fmul float %temp84.6, %953
+ %958 = fmul float %temp85.6, %954
+ %959 = fmul float %temp86.6, %955
+ %960 = fmul float %temp87.6, %956
+ %961 = fmul float %2, -2.000000e+00
+ %962 = fadd float %961, 1.000000e+00
+ %963 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 23)
+ %964 = extractelement <4 x float> %963, i32 2
+ %965 = fsub float -0.000000e+00, %964
+ %966 = fadd float %962, %965
+ %967 = fdiv float 1.000000e+00, %966
+ %968 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 24)
+ %969 = extractelement <4 x float> %968, i32 2
+ %970 = fmul float %969, %967
+ %971 = fsub float -0.000000e+00, %53
+ %972 = fmul float %971, %53
+ %973 = fmul float %972, %970
+ %974 = fmul float %973, %970
+ %975 = fmul float %974, 0x3FF7154760000000
+ %976 = call float @llvm.AMDIL.exp.(float %975)
+ %977 = fcmp oeq float %53, 1.000000e+00
+ %978 = sext i1 %977 to i32
+ %979 = bitcast i32 %978 to float
+ %980 = bitcast float %979 to i32
+ %981 = icmp ne i32 %980, 0
+ %.184 = select i1 %981, float 1.000000e+00, float %976
+ %982 = call float @llvm.AMDGPU.lrp(float %.184, float %957, float %47)
+ %983 = call float @llvm.AMDGPU.lrp(float %.184, float %958, float %49)
+ %984 = call float @llvm.AMDGPU.lrp(float %.184, float %959, float %51)
+ %985 = insertelement <4 x float> undef, float %982, i32 0
+ %986 = insertelement <4 x float> %985, float %983, i32 1
+ %987 = insertelement <4 x float> %986, float %984, i32 2
+ %988 = insertelement <4 x float> %987, float %960, i32 3
+ call void @llvm.R600.store.swizzle(<4 x float> %988, i32 0, i32 0)
+ ret void
+}
+
+; Function Attrs: readnone
+declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1
+
+; Function Attrs: readnone
+declare float @llvm.AMDGPU.rsq.f32(float) #1
+
+; Function Attrs: readnone
+declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) #1
+
+; Function Attrs: readonly
+declare float @fabs(float) #2
+
+; Function Attrs: readnone
+declare float @llvm.AMDIL.exp.(float) #1
+
+; Function Attrs: readnone
+declare float @llvm.AMDGPU.lrp(float, float, float) #1
+
+; Function Attrs: readnone
+declare float @llvm.AMDIL.clamp.(float, float, float) #1
+
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+
+attributes #0 = { "ShaderType"="0" }
+attributes #1 = { readnone }
+attributes #2 = { readonly }
diff --git a/test/CodeGen/AMDGPU/bitcast.ll b/test/CodeGen/AMDGPU/bitcast.ll
new file mode 100644
index 000000000000..fd56d956bf31
--- /dev/null
+++ b/test/CodeGen/AMDGPU/bitcast.ll
@@ -0,0 +1,79 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+; This test just checks that the compiler doesn't crash.
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+
+; FUNC-LABEL: {{^}}v32i8_to_v8i32:
+; SI: s_endpgm
+define void @v32i8_to_v8i32(<32 x i8> addrspace(2)* inreg) #0 {
+entry:
+ %1 = load <32 x i8>, <32 x i8> addrspace(2)* %0
+ %2 = bitcast <32 x i8> %1 to <8 x i32>
+ %3 = extractelement <8 x i32> %2, i32 1
+ %4 = icmp ne i32 %3, 0
+ %5 = select i1 %4, float 0.0, float 1.0
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %5, float %5, float %5, float %5)
+ ret void
+}
+
+; FUNC-LABEL: {{^}}i8ptr_v16i8ptr:
+; SI: s_endpgm
+define void @i8ptr_v16i8ptr(<16 x i8> addrspace(1)* %out, i8 addrspace(1)* %in) {
+entry:
+ %0 = bitcast i8 addrspace(1)* %in to <16 x i8> addrspace(1)*
+ %1 = load <16 x i8>, <16 x i8> addrspace(1)* %0
+ store <16 x i8> %1, <16 x i8> addrspace(1)* %out
+ ret void
+}
+
+define void @f32_to_v2i16(<2 x i16> addrspace(1)* %out, float addrspace(1)* %in) nounwind {
+ %load = load float, float addrspace(1)* %in, align 4
+ %bc = bitcast float %load to <2 x i16>
+ store <2 x i16> %bc, <2 x i16> addrspace(1)* %out, align 4
+ ret void
+}
+
+define void @v2i16_to_f32(float addrspace(1)* %out, <2 x i16> addrspace(1)* %in) nounwind {
+ %load = load <2 x i16>, <2 x i16> addrspace(1)* %in, align 4
+ %bc = bitcast <2 x i16> %load to float
+ store float %bc, float addrspace(1)* %out, align 4
+ ret void
+}
+
+define void @v4i8_to_i32(i32 addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
+ %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
+ %bc = bitcast <4 x i8> %load to i32
+ store i32 %bc, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+define void @i32_to_v4i8(<4 x i8> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %load = load i32, i32 addrspace(1)* %in, align 4
+ %bc = bitcast i32 %load to <4 x i8>
+ store <4 x i8> %bc, <4 x i8> addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bitcast_v2i32_to_f64:
+; SI: s_endpgm
+define void @bitcast_v2i32_to_f64(double addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+ %val = load <2 x i32>, <2 x i32> addrspace(1)* %in, align 8
+ %add = add <2 x i32> %val, <i32 4, i32 9>
+ %bc = bitcast <2 x i32> %add to double
+ store double %bc, double addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bitcast_f64_to_v2i32:
+; SI: s_endpgm
+define void @bitcast_f64_to_v2i32(<2 x i32> addrspace(1)* %out, double addrspace(1)* %in) {
+ %val = load double, double addrspace(1)* %in, align 8
+ %add = fadd double %val, 4.0
+ %bc = bitcast double %add to <2 x i32>
+ store <2 x i32> %bc, <2 x i32> addrspace(1)* %out, align 8
+ ret void
+}
+
+attributes #0 = { "ShaderType"="0" }
diff --git a/test/CodeGen/AMDGPU/bswap.ll b/test/CodeGen/AMDGPU/bswap.ll
new file mode 100644
index 000000000000..4cf8e4bfed50
--- /dev/null
+++ b/test/CodeGen/AMDGPU/bswap.ll
@@ -0,0 +1,115 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+declare i32 @llvm.bswap.i32(i32) nounwind readnone
+declare <2 x i32> @llvm.bswap.v2i32(<2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.bswap.v4i32(<4 x i32>) nounwind readnone
+declare <8 x i32> @llvm.bswap.v8i32(<8 x i32>) nounwind readnone
+declare i64 @llvm.bswap.i64(i64) nounwind readnone
+declare <2 x i64> @llvm.bswap.v2i64(<2 x i64>) nounwind readnone
+declare <4 x i64> @llvm.bswap.v4i64(<4 x i64>) nounwind readnone
+
+; FUNC-LABEL: @test_bswap_i32
+; SI: buffer_load_dword [[VAL:v[0-9]+]]
+; SI-DAG: v_alignbit_b32 [[TMP0:v[0-9]+]], [[VAL]], [[VAL]], 8
+; SI-DAG: v_alignbit_b32 [[TMP1:v[0-9]+]], [[VAL]], [[VAL]], 24
+; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0xff00ff
+; SI: v_bfi_b32 [[RESULT:v[0-9]+]], [[K]], [[TMP1]], [[TMP0]]
+; SI: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @test_bswap_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %val = load i32, i32 addrspace(1)* %in, align 4
+ %bswap = call i32 @llvm.bswap.i32(i32 %val) nounwind readnone
+ store i32 %bswap, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @test_bswap_v2i32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_bfi_b32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_bfi_b32
+; SI: s_endpgm
+define void @test_bswap_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) nounwind {
+ %val = load <2 x i32>, <2 x i32> addrspace(1)* %in, align 8
+ %bswap = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %val) nounwind readnone
+ store <2 x i32> %bswap, <2 x i32> addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: @test_bswap_v4i32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_bfi_b32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_bfi_b32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_bfi_b32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_bfi_b32
+; SI: s_endpgm
+define void @test_bswap_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) nounwind {
+ %val = load <4 x i32>, <4 x i32> addrspace(1)* %in, align 16
+ %bswap = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %val) nounwind readnone
+ store <4 x i32> %bswap, <4 x i32> addrspace(1)* %out, align 16
+ ret void
+}
+
+; FUNC-LABEL: @test_bswap_v8i32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_bfi_b32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_bfi_b32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_bfi_b32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_bfi_b32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_bfi_b32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_bfi_b32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_bfi_b32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_bfi_b32
+; SI: s_endpgm
+define void @test_bswap_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) nounwind {
+ %val = load <8 x i32>, <8 x i32> addrspace(1)* %in, align 32
+ %bswap = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %val) nounwind readnone
+ store <8 x i32> %bswap, <8 x i32> addrspace(1)* %out, align 32
+ ret void
+}
+
+define void @test_bswap_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
+ %val = load i64, i64 addrspace(1)* %in, align 8
+ %bswap = call i64 @llvm.bswap.i64(i64 %val) nounwind readnone
+ store i64 %bswap, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+define void @test_bswap_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) nounwind {
+ %val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16
+ %bswap = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %val) nounwind readnone
+ store <2 x i64> %bswap, <2 x i64> addrspace(1)* %out, align 16
+ ret void
+}
+
+define void @test_bswap_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) nounwind {
+ %val = load <4 x i64>, <4 x i64> addrspace(1)* %in, align 32
+ %bswap = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %val) nounwind readnone
+ store <4 x i64> %bswap, <4 x i64> addrspace(1)* %out, align 32
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/build_vector.ll b/test/CodeGen/AMDGPU/build_vector.ll
new file mode 100644
index 000000000000..65eacf5adc41
--- /dev/null
+++ b/test/CodeGen/AMDGPU/build_vector.ll
@@ -0,0 +1,35 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600
+; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=SI
+
+; R600: {{^}}build_vector2:
+; R600: MOV
+; R600: MOV
+; R600-NOT: MOV
+; SI: {{^}}build_vector2:
+; SI-DAG: v_mov_b32_e32 v[[X:[0-9]]], 5
+; SI-DAG: v_mov_b32_e32 v[[Y:[0-9]]], 6
+; SI: buffer_store_dwordx2 v{{\[}}[[X]]:[[Y]]{{\]}}
+define void @build_vector2 (<2 x i32> addrspace(1)* %out) {
+entry:
+ store <2 x i32> <i32 5, i32 6>, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; R600: {{^}}build_vector4:
+; R600: MOV
+; R600: MOV
+; R600: MOV
+; R600: MOV
+; R600-NOT: MOV
+; SI: {{^}}build_vector4:
+; SI-DAG: v_mov_b32_e32 v[[X:[0-9]]], 5
+; SI-DAG: v_mov_b32_e32 v[[Y:[0-9]]], 6
+; SI-DAG: v_mov_b32_e32 v[[Z:[0-9]]], 7
+; SI-DAG: v_mov_b32_e32 v[[W:[0-9]]], 8
+; SI: buffer_store_dwordx4 v{{\[}}[[X]]:[[W]]{{\]}}
+define void @build_vector4 (<4 x i32> addrspace(1)* %out) {
+entry:
+ store <4 x i32> <i32 5, i32 6, i32 7, i32 8>, <4 x i32> addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/call.ll b/test/CodeGen/AMDGPU/call.ll
new file mode 100644
index 000000000000..e769fd11c282
--- /dev/null
+++ b/test/CodeGen/AMDGPU/call.ll
@@ -0,0 +1,33 @@
+; RUN: not llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s 2>&1 | FileCheck %s
+; RUN: not llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s 2>&1 | FileCheck %s
+; RUN: not llc -march=r600 -mcpu=cypress < %s 2>&1 | FileCheck %s
+
+; CHECK: error: unsupported call to function external_function in test_call_external
+
+
+declare i32 @external_function(i32) nounwind
+
+define void @test_call_external(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+ %a = load i32, i32 addrspace(1)* %in
+ %b = load i32, i32 addrspace(1)* %b_ptr
+ %c = call i32 @external_function(i32 %b) nounwind
+ %result = add i32 %a, %c
+ store i32 %result, i32 addrspace(1)* %out
+ ret void
+}
+
+define i32 @defined_function(i32 %x) nounwind noinline {
+ %y = add i32 %x, 8
+ ret i32 %y
+}
+
+define void @test_call(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+ %a = load i32, i32 addrspace(1)* %in
+ %b = load i32, i32 addrspace(1)* %b_ptr
+ %c = call i32 @defined_function(i32 %b) nounwind
+ %result = add i32 %a, %c
+ store i32 %result, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/call_fs.ll b/test/CodeGen/AMDGPU/call_fs.ll
new file mode 100644
index 000000000000..87bebbc49d52
--- /dev/null
+++ b/test/CodeGen/AMDGPU/call_fs.ll
@@ -0,0 +1,17 @@
+
+; RUN: llc < %s -march=r600 -mcpu=redwood -show-mc-encoding -o - | FileCheck --check-prefix=EG %s
+; RUN: llc < %s -march=r600 -mcpu=rv710 -show-mc-encoding -o - | FileCheck --check-prefix=R600 %s
+
+; EG: .long 257
+; EG: {{^}}call_fs:
+; EG: CALL_FS ; encoding: [0x00,0x00,0x00,0x00,0x00,0x00,0xc0,0x84]
+; R600: .long 257
+; R600: {{^}}call_fs:
+; R600: CALL_FS ; encoding: [0x00,0x00,0x00,0x00,0x00,0x00,0x80,0x89]
+
+
+define void @call_fs() #0 {
+ ret void
+}
+
+attributes #0 = { "ShaderType"="1" } ; Vertex Shader
diff --git a/test/CodeGen/AMDGPU/cayman-loop-bug.ll b/test/CodeGen/AMDGPU/cayman-loop-bug.ll
new file mode 100644
index 000000000000..c7b8c4037316
--- /dev/null
+++ b/test/CodeGen/AMDGPU/cayman-loop-bug.ll
@@ -0,0 +1,32 @@
+; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s
+
+; CHECK-LABEL: {{^}}main:
+; CHECK: LOOP_START_DX10
+; CHECK: ALU_PUSH_BEFORE
+; CHECK: LOOP_START_DX10
+; CHECK: PUSH
+; CHECK-NOT: ALU_PUSH_BEFORE
+; CHECK: END_LOOP
+; CHECK: END_LOOP
+define void @main (<4 x float> inreg %reg0) #0 {
+entry:
+ br label %outer_loop
+outer_loop:
+ %cnt = phi i32 [0, %entry], [%cnt_incr, %inner_loop]
+ %cond = icmp eq i32 %cnt, 16
+ br i1 %cond, label %outer_loop_body, label %exit
+outer_loop_body:
+ %cnt_incr = add i32 %cnt, 1
+ br label %inner_loop
+inner_loop:
+ %cnt2 = phi i32 [0, %outer_loop_body], [%cnt2_incr, %inner_loop_body]
+ %cond2 = icmp eq i32 %cnt2, 16
+ br i1 %cond2, label %inner_loop_body, label %outer_loop
+inner_loop_body:
+ %cnt2_incr = add i32 %cnt2, 1
+ br label %inner_loop
+exit:
+ ret void
+}
+
+attributes #0 = { "ShaderType"="0" } \ No newline at end of file
diff --git a/test/CodeGen/AMDGPU/cf-stack-bug.ll b/test/CodeGen/AMDGPU/cf-stack-bug.ll
new file mode 100644
index 000000000000..75b87e486226
--- /dev/null
+++ b/test/CodeGen/AMDGPU/cf-stack-bug.ll
@@ -0,0 +1,244 @@
+; RUN: llc -march=r600 -mcpu=redwood -debug-only=r600cf %s -o - 2>%t | FileCheck %s --check-prefix=FUNC
+; RUN: FileCheck --check-prefix=BUG64 %s < %t
+
+; RUN: llc -march=r600 -mcpu=sumo -debug-only=r600cf %s -o - 2>%t | FileCheck %s --check-prefix=FUNC
+; RUN: FileCheck --check-prefix=BUG64 %s < %t
+
+; RUN: llc -march=r600 -mcpu=barts -debug-only=r600cf %s -o - 2>%t | FileCheck %s --check-prefix=FUNC
+; RUN: FileCheck --check-prefix=BUG64 %s < %t
+
+; RUN: llc -march=r600 -mcpu=turks -debug-only=r600cf %s -o - 2>%t | FileCheck %s --check-prefix=FUNC
+; RUN: FileCheck --check-prefix=BUG64 %s < %t
+
+; RUN: llc -march=r600 -mcpu=caicos -debug-only=r600cf %s -o - 2>%t | FileCheck %s --check-prefix=FUNC
+; RUN: FileCheck --check-prefix=BUG64 %s < %t
+
+; RUN: llc -march=r600 -mcpu=cedar -debug-only=r600cf %s -o - 2>%t | FileCheck %s --check-prefix=FUNC
+; RUN: FileCheck --check-prefix=BUG32 %s < %t
+
+; RUN: llc -march=r600 -mcpu=juniper -debug-only=r600cf %s -o - 2>%t | FileCheck %s --check-prefix=FUNC
+; RUN: FileCheck --check-prefix=NOBUG %s < %t
+
+; RUN: llc -march=r600 -mcpu=cypress -debug-only=r600cf %s -o - 2>%t | FileCheck %s --check-prefix=FUNC
+; RUN: FileCheck --check-prefix=NOBUG %s < %t
+
+; RUN: llc -march=r600 -mcpu=cayman -debug-only=r600cf %s -o - 2>%t | FileCheck %s --check-prefix=FUNC
+; RUN: FileCheck --check-prefix=NOBUG %s < %t
+
+; REQUIRES: asserts
+
+; We are currently allocating 2 extra sub-entries on Evergreen / NI for
+; non-WQM push instructions; if we change this to 1, then we will need to
+; add one level of depth to each of these tests.
+
+; BUG64-NOT: Applying bug work-around
+; BUG32-NOT: Applying bug work-around
+; NOBUG-NOT: Applying bug work-around
+; FUNC-LABEL: {{^}}nested3:
+define void @nested3(i32 addrspace(1)* %out, i32 %cond) {
+entry:
+ %0 = icmp sgt i32 %cond, 0
+ br i1 %0, label %if.1, label %end
+
+if.1:
+ %1 = icmp sgt i32 %cond, 10
+ br i1 %1, label %if.2, label %if.store.1
+
+if.store.1:
+ store i32 1, i32 addrspace(1)* %out
+ br label %end
+
+if.2:
+ %2 = icmp sgt i32 %cond, 20
+ br i1 %2, label %if.3, label %if.2.store
+
+if.2.store:
+ store i32 2, i32 addrspace(1)* %out
+ br label %end
+
+if.3:
+ store i32 3, i32 addrspace(1)* %out
+ br label %end
+
+end:
+ ret void
+}
+
+; BUG64: Applying bug work-around
+; BUG32-NOT: Applying bug work-around
+; NOBUG-NOT: Applying bug work-around
+; FUNC-LABEL: {{^}}nested4:
+define void @nested4(i32 addrspace(1)* %out, i32 %cond) {
+entry:
+ %0 = icmp sgt i32 %cond, 0
+ br i1 %0, label %if.1, label %end
+
+if.1:
+ %1 = icmp sgt i32 %cond, 10
+ br i1 %1, label %if.2, label %if.1.store
+
+if.1.store:
+ store i32 1, i32 addrspace(1)* %out
+ br label %end
+
+if.2:
+ %2 = icmp sgt i32 %cond, 20
+ br i1 %2, label %if.3, label %if.2.store
+
+if.2.store:
+ store i32 2, i32 addrspace(1)* %out
+ br label %end
+
+if.3:
+ %3 = icmp sgt i32 %cond, 30
+ br i1 %3, label %if.4, label %if.3.store
+
+if.3.store:
+ store i32 3, i32 addrspace(1)* %out
+ br label %end
+
+if.4:
+ store i32 4, i32 addrspace(1)* %out
+ br label %end
+
+end:
+ ret void
+}
+
+; BUG64: Applying bug work-around
+; BUG32-NOT: Applying bug work-around
+; NOBUG-NOT: Applying bug work-around
+; FUNC-LABEL: {{^}}nested7:
+define void @nested7(i32 addrspace(1)* %out, i32 %cond) {
+entry:
+ %0 = icmp sgt i32 %cond, 0
+ br i1 %0, label %if.1, label %end
+
+if.1:
+ %1 = icmp sgt i32 %cond, 10
+ br i1 %1, label %if.2, label %if.1.store
+
+if.1.store:
+ store i32 1, i32 addrspace(1)* %out
+ br label %end
+
+if.2:
+ %2 = icmp sgt i32 %cond, 20
+ br i1 %2, label %if.3, label %if.2.store
+
+if.2.store:
+ store i32 2, i32 addrspace(1)* %out
+ br label %end
+
+if.3:
+ %3 = icmp sgt i32 %cond, 30
+ br i1 %3, label %if.4, label %if.3.store
+
+if.3.store:
+ store i32 3, i32 addrspace(1)* %out
+ br label %end
+
+if.4:
+ %4 = icmp sgt i32 %cond, 40
+ br i1 %4, label %if.5, label %if.4.store
+
+if.4.store:
+ store i32 4, i32 addrspace(1)* %out
+ br label %end
+
+if.5:
+ %5 = icmp sgt i32 %cond, 50
+ br i1 %5, label %if.6, label %if.5.store
+
+if.5.store:
+ store i32 5, i32 addrspace(1)* %out
+ br label %end
+
+if.6:
+ %6 = icmp sgt i32 %cond, 60
+ br i1 %6, label %if.7, label %if.6.store
+
+if.6.store:
+ store i32 6, i32 addrspace(1)* %out
+ br label %end
+
+if.7:
+ store i32 7, i32 addrspace(1)* %out
+ br label %end
+
+end:
+ ret void
+}
+
+; BUG64: Applying bug work-around
+; BUG32: Applying bug work-around
+; NOBUG-NOT: Applying bug work-around
+; FUNC-LABEL: {{^}}nested8:
+define void @nested8(i32 addrspace(1)* %out, i32 %cond) {
+entry:
+ %0 = icmp sgt i32 %cond, 0
+ br i1 %0, label %if.1, label %end
+
+if.1:
+ %1 = icmp sgt i32 %cond, 10
+ br i1 %1, label %if.2, label %if.1.store
+
+if.1.store:
+ store i32 1, i32 addrspace(1)* %out
+ br label %end
+
+if.2:
+ %2 = icmp sgt i32 %cond, 20
+ br i1 %2, label %if.3, label %if.2.store
+
+if.2.store:
+ store i32 2, i32 addrspace(1)* %out
+ br label %end
+
+if.3:
+ %3 = icmp sgt i32 %cond, 30
+ br i1 %3, label %if.4, label %if.3.store
+
+if.3.store:
+ store i32 3, i32 addrspace(1)* %out
+ br label %end
+
+if.4:
+ %4 = icmp sgt i32 %cond, 40
+ br i1 %4, label %if.5, label %if.4.store
+
+if.4.store:
+ store i32 4, i32 addrspace(1)* %out
+ br label %end
+
+if.5:
+ %5 = icmp sgt i32 %cond, 50
+ br i1 %5, label %if.6, label %if.5.store
+
+if.5.store:
+ store i32 5, i32 addrspace(1)* %out
+ br label %end
+
+if.6:
+ %6 = icmp sgt i32 %cond, 60
+ br i1 %6, label %if.7, label %if.6.store
+
+if.6.store:
+ store i32 6, i32 addrspace(1)* %out
+ br label %end
+
+if.7:
+ %7 = icmp sgt i32 %cond, 70
+ br i1 %7, label %if.8, label %if.7.store
+
+if.7.store:
+ store i32 7, i32 addrspace(1)* %out
+ br label %end
+
+if.8:
+ store i32 8, i32 addrspace(1)* %out
+ br label %end
+
+end:
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/cf_end.ll b/test/CodeGen/AMDGPU/cf_end.ll
new file mode 100644
index 000000000000..c74ee22868d5
--- /dev/null
+++ b/test/CodeGen/AMDGPU/cf_end.ll
@@ -0,0 +1,9 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood --show-mc-encoding | FileCheck --check-prefix=EG %s
+; RUN: llc < %s -march=r600 -mcpu=caicos --show-mc-encoding | FileCheck --check-prefix=EG %s
+; RUN: llc < %s -march=r600 -mcpu=cayman --show-mc-encoding | FileCheck --check-prefix=CM %s
+
+; EG: CF_END ; encoding: [0x00,0x00,0x00,0x00,0x00,0x00,0x20,0x80]
+; CM: CF_END ; encoding: [0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x88]
+define void @eop() {
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/cgp-addressing-modes.ll b/test/CodeGen/AMDGPU/cgp-addressing-modes.ll
new file mode 100644
index 000000000000..77f7bd01b7f0
--- /dev/null
+++ b/test/CodeGen/AMDGPU/cgp-addressing-modes.ll
@@ -0,0 +1,242 @@
+; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown < %s | FileCheck -check-prefix=OPT %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN %s
+
+declare i32 @llvm.r600.read.tidig.x() #0
+
+; OPT-LABEL: @test_sink_global_small_offset_i32(
+; OPT-NOT: getelementptr i32, i32 addrspace(1)* %in
+; OPT: br i1
+; OPT: ptrtoint
+
+; GCN-LABEL: {{^}}test_sink_global_small_offset_i32:
+; GCN: {{^}}BB0_2:
+define void @test_sink_global_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %cond) {
+entry:
+ %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
+ %in.gep = getelementptr i32, i32 addrspace(1)* %in, i64 7
+ %tmp0 = icmp eq i32 %cond, 0
+ br i1 %tmp0, label %endif, label %if
+
+if:
+ %tmp1 = load i32, i32 addrspace(1)* %in.gep
+ br label %endif
+
+endif:
+ %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
+ store i32 %x, i32 addrspace(1)* %out.gep
+ br label %done
+
+done:
+ ret void
+}
+
+; OPT-LABEL: @test_sink_global_small_max_i32_ds_offset(
+; OPT: %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 65535
+; OPT: br i1
+
+; GCN-LABEL: {{^}}test_sink_global_small_max_i32_ds_offset:
+; GCN: s_and_saveexec_b64
+; GCN: buffer_load_sbyte {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}}
+; GCN: {{^}}BB1_2:
+; GCN: s_or_b64 exec
+define void @test_sink_global_small_max_i32_ds_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in, i32 %cond) {
+entry:
+ %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 99999
+ %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 65535
+ %tmp0 = icmp eq i32 %cond, 0
+ br i1 %tmp0, label %endif, label %if
+
+if:
+ %tmp1 = load i8, i8 addrspace(1)* %in.gep
+ %tmp2 = sext i8 %tmp1 to i32
+ br label %endif
+
+endif:
+ %x = phi i32 [ %tmp2, %if ], [ 0, %entry ]
+ store i32 %x, i32 addrspace(1)* %out.gep
+ br label %done
+
+done:
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_sink_global_small_max_mubuf_offset:
+; GCN: s_and_saveexec_b64
+; GCN: buffer_load_sbyte {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4095{{$}}
+; GCN: {{^}}BB2_2:
+; GCN: s_or_b64 exec
+define void @test_sink_global_small_max_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in, i32 %cond) {
+entry:
+ %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 1024
+ %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 4095
+ %tmp0 = icmp eq i32 %cond, 0
+ br i1 %tmp0, label %endif, label %if
+
+if:
+ %tmp1 = load i8, i8 addrspace(1)* %in.gep
+ %tmp2 = sext i8 %tmp1 to i32
+ br label %endif
+
+endif:
+ %x = phi i32 [ %tmp2, %if ], [ 0, %entry ]
+ store i32 %x, i32 addrspace(1)* %out.gep
+ br label %done
+
+done:
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_sink_global_small_max_plus_1_mubuf_offset:
+; GCN: s_and_saveexec_b64
+; GCN: buffer_load_sbyte {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}}
+; GCN: {{^}}BB3_2:
+; GCN: s_or_b64 exec
+define void @test_sink_global_small_max_plus_1_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in, i32 %cond) {
+entry:
+ %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 99999
+ %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 4096
+ %tmp0 = icmp eq i32 %cond, 0
+ br i1 %tmp0, label %endif, label %if
+
+if:
+ %tmp1 = load i8, i8 addrspace(1)* %in.gep
+ %tmp2 = sext i8 %tmp1 to i32
+ br label %endif
+
+endif:
+ %x = phi i32 [ %tmp2, %if ], [ 0, %entry ]
+ store i32 %x, i32 addrspace(1)* %out.gep
+ br label %done
+
+done:
+ ret void
+}
+
+; OPT-LABEL: @test_no_sink_flat_small_offset_i32(
+; OPT: getelementptr i32, i32 addrspace(4)* %in
+; OPT: br i1
+; OPT-NOT: ptrtoint
+
+; GCN-LABEL: {{^}}test_no_sink_flat_small_offset_i32:
+; GCN: flat_load_dword
+; GCN: {{^}}BB4_2:
+
+define void @test_no_sink_flat_small_offset_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) {
+entry:
+ %out.gep = getelementptr i32, i32 addrspace(4)* %out, i64 999999
+ %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 7
+ %tmp0 = icmp eq i32 %cond, 0
+ br i1 %tmp0, label %endif, label %if
+
+if:
+ %tmp1 = load i32, i32 addrspace(4)* %in.gep
+ br label %endif
+
+endif:
+ %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
+ store i32 %x, i32 addrspace(4)* %out.gep
+ br label %done
+
+done:
+ ret void
+}
+
+; OPT-LABEL: @test_sink_scratch_small_offset_i32(
+; OPT-NOT: getelementptr [512 x i32]
+; OPT: br i1
+; OPT: ptrtoint
+
+; GCN-LABEL: {{^}}test_sink_scratch_small_offset_i32:
+; GCN: s_and_saveexec_b64
+; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:4092{{$}}
+; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:4092{{$}}
+; GCN: {{^}}BB5_2:
+define void @test_sink_scratch_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %cond, i32 %arg) {
+entry:
+ %alloca = alloca [512 x i32], align 4
+ %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i64 999998
+ %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i64 999999
+ %add.arg = add i32 %arg, 8
+ %alloca.gep = getelementptr [512 x i32], [512 x i32]* %alloca, i32 0, i32 1023
+ %tmp0 = icmp eq i32 %cond, 0
+ br i1 %tmp0, label %endif, label %if
+
+if:
+ store volatile i32 123, i32* %alloca.gep
+ %tmp1 = load volatile i32, i32* %alloca.gep
+ br label %endif
+
+endif:
+ %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
+ store i32 %x, i32 addrspace(1)* %out.gep.0
+ %load = load volatile i32, i32* %alloca.gep
+ store i32 %load, i32 addrspace(1)* %out.gep.1
+ br label %done
+
+done:
+ ret void
+}
+
+; OPT-LABEL: @test_no_sink_scratch_large_offset_i32(
+; OPT: %alloca.gep = getelementptr [512 x i32], [512 x i32]* %alloca, i32 0, i32 1024
+; OPT: br i1
+; OPT-NOT: ptrtoint
+
+; GCN-LABEL: {{^}}test_no_sink_scratch_large_offset_i32:
+; GCN: s_and_saveexec_b64
+; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
+; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
+; GCN: {{^}}BB6_2:
+define void @test_no_sink_scratch_large_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %cond, i32 %arg) {
+entry:
+ %alloca = alloca [512 x i32], align 4
+ %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i64 999998
+ %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i64 999999
+ %add.arg = add i32 %arg, 8
+ %alloca.gep = getelementptr [512 x i32], [512 x i32]* %alloca, i32 0, i32 1024
+ %tmp0 = icmp eq i32 %cond, 0
+ br i1 %tmp0, label %endif, label %if
+
+if:
+ store volatile i32 123, i32* %alloca.gep
+ %tmp1 = load volatile i32, i32* %alloca.gep
+ br label %endif
+
+endif:
+ %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
+ store i32 %x, i32 addrspace(1)* %out.gep.0
+ %load = load volatile i32, i32* %alloca.gep
+ store i32 %load, i32 addrspace(1)* %out.gep.1
+ br label %done
+
+done:
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_sink_global_vreg_sreg_i32:
+; GCN: s_and_saveexec_b64
+; GCN: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; GCN: {{^}}BB7_2:
+define void @test_sink_global_vreg_sreg_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %offset, i32 %cond) {
+entry:
+ %offset.ext = zext i32 %offset to i64
+ %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
+ %in.gep = getelementptr i32, i32 addrspace(1)* %in, i64 %offset.ext
+ %tmp0 = icmp eq i32 %cond, 0
+ br i1 %tmp0, label %endif, label %if
+
+if:
+ %tmp1 = load i32, i32 addrspace(1)* %in.gep
+ br label %endif
+
+endif:
+ %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
+ store i32 %x, i32 addrspace(1)* %out.gep
+ br label %done
+
+done:
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/coalescer_remat.ll b/test/CodeGen/AMDGPU/coalescer_remat.ll
new file mode 100644
index 000000000000..96730bcf2e8f
--- /dev/null
+++ b/test/CodeGen/AMDGPU/coalescer_remat.ll
@@ -0,0 +1,57 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs -mtriple=amdgcn-- -o - %s | FileCheck %s
+
+declare float @llvm.fma.f32(float, float, float)
+
+; This checks that rematerialization support of the coalescer does not
+; unnecessarily widen the register class. Without those fixes, more than
+; 20 VGPRs are used here.
+; Also check that some rematerialization of the 0 constant happened.
+; CHECK-LABEL: foobar
+; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0
+; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0
+; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0
+; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0
+; It's probably OK if this is slightly higher:
+; CHECK: ; NumVgprs: 9
+define void @foobar(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in, i32 %flag) {
+entry:
+ %cmpflag = icmp eq i32 %flag, 1
+ br i1 %cmpflag, label %loop, label %exit
+
+loop:
+ %c = phi i32 [0, %entry], [%cnext, %loop]
+ %v0 = phi float [0.0, %entry], [%fma.0, %loop]
+ %v1 = phi float [0.0, %entry], [%fma.1, %loop]
+ %v2 = phi float [0.0, %entry], [%fma.2, %loop]
+ %v3 = phi float [0.0, %entry], [%fma.3, %loop]
+
+ ; Try to get the 0 constant to get coalesced into a wide register
+ %blup = insertelement <4 x float> undef, float %v0, i32 0
+ store <4 x float> %blup, <4 x float> addrspace(1)* %out
+
+ %load = load <4 x float>, <4 x float> addrspace(1)* %in
+ %load.0 = extractelement <4 x float> %load, i32 0
+ %load.1 = extractelement <4 x float> %load, i32 1
+ %load.2 = extractelement <4 x float> %load, i32 2
+ %load.3 = extractelement <4 x float> %load, i32 3
+ %fma.0 = call float @llvm.fma.f32(float %v0, float %load.0, float %v0)
+ %fma.1 = call float @llvm.fma.f32(float %v1, float %load.1, float %v1)
+ %fma.2 = call float @llvm.fma.f32(float %v2, float %load.2, float %v2)
+ %fma.3 = call float @llvm.fma.f32(float %v3, float %load.3, float %v3)
+
+ %cnext = add nsw i32 %c, 1
+ %cmp = icmp eq i32 %cnext, 42
+ br i1 %cmp, label %exit, label %loop
+
+exit:
+ %ev0 = phi float [0.0, %entry], [%fma.0, %loop]
+ %ev1 = phi float [0.0, %entry], [%fma.1, %loop]
+ %ev2 = phi float [0.0, %entry], [%fma.2, %loop]
+ %ev3 = phi float [0.0, %entry], [%fma.3, %loop]
+ %dst.0 = insertelement <4 x float> undef, float %ev0, i32 0
+ %dst.1 = insertelement <4 x float> %dst.0, float %ev1, i32 1
+ %dst.2 = insertelement <4 x float> %dst.1, float %ev2, i32 2
+ %dst.3 = insertelement <4 x float> %dst.2, float %ev3, i32 3
+ store <4 x float> %dst.3, <4 x float> addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/codegen-prepare-addrmode-sext.ll b/test/CodeGen/AMDGPU/codegen-prepare-addrmode-sext.ll
new file mode 100644
index 000000000000..585172092676
--- /dev/null
+++ b/test/CodeGen/AMDGPU/codegen-prepare-addrmode-sext.ll
@@ -0,0 +1,18 @@
+; RUN: opt -mtriple=amdgcn-- -codegenprepare -S < %s | FileCheck -check-prefix=OPT %s
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI-LLC %s
+
+; OPT-LABEL: @test(
+; OPT: mul nsw i32
+; OPT-NEXT: sext
+
+; SI-LLC-LABEL: {{^}}test:
+; SI-LLC: s_mul_i32
+; SI-LLC-NOT: mul
+define void @test(i8 addrspace(1)* nocapture readonly %in, i32 %a, i8 %b) {
+entry:
+ %0 = mul nsw i32 %a, 3
+ %1 = sext i32 %0 to i64
+ %2 = getelementptr i8, i8 addrspace(1)* %in, i64 %1
+ store i8 %b, i8 addrspace(1)* %2
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/combine_vloads.ll b/test/CodeGen/AMDGPU/combine_vloads.ll
new file mode 100644
index 000000000000..01572afa6205
--- /dev/null
+++ b/test/CodeGen/AMDGPU/combine_vloads.ll
@@ -0,0 +1,42 @@
+; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG %s
+
+;
+; kernel void combine_vloads(global char8* src, global char8* result) {
+; for (int i = 0; i < 1024; ++i)
+; result[i] = src[0] + src[1] + src[2] + src[3];
+; }
+;
+
+
+; 128-bit loads instead of many 8-bit loads
+; EG-LABEL: {{^}}combine_vloads:
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+define void @combine_vloads(<8 x i8> addrspace(1)* nocapture %src, <8 x i8> addrspace(1)* nocapture %result) nounwind {
+entry:
+ br label %for.body
+
+for.exit: ; preds = %for.body
+ ret void
+
+for.body: ; preds = %for.body, %entry
+ %i.01 = phi i32 [ 0, %entry ], [ %tmp19, %for.body ]
+ %arrayidx_v4 = bitcast <8 x i8> addrspace(1)* %src to <32 x i8> addrspace(1)*
+ %0 = bitcast <32 x i8> addrspace(1)* %arrayidx_v4 to <8 x i32> addrspace(1)*
+ %vecload2 = load <8 x i32>, <8 x i32> addrspace(1)* %0, align 32
+ %1 = bitcast <8 x i32> %vecload2 to <32 x i8>
+ %tmp5 = shufflevector <32 x i8> %1, <32 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %tmp8 = shufflevector <32 x i8> %1, <32 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %tmp9 = add nsw <8 x i8> %tmp5, %tmp8
+ %tmp12 = shufflevector <32 x i8> %1, <32 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+ %tmp13 = add nsw <8 x i8> %tmp9, %tmp12
+ %tmp16 = shufflevector <32 x i8> %1, <32 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ %tmp17 = add nsw <8 x i8> %tmp13, %tmp16
+ %scevgep = getelementptr <8 x i8>, <8 x i8> addrspace(1)* %result, i32 %i.01
+ %2 = bitcast <8 x i8> %tmp17 to <2 x i32>
+ %3 = bitcast <8 x i8> addrspace(1)* %scevgep to <2 x i32> addrspace(1)*
+ store <2 x i32> %2, <2 x i32> addrspace(1)* %3, align 8
+ %tmp19 = add nsw i32 %i.01, 1
+ %exitcond = icmp eq i32 %tmp19, 1024
+ br i1 %exitcond, label %for.exit, label %for.body
+}
diff --git a/test/CodeGen/AMDGPU/commute-compares.ll b/test/CodeGen/AMDGPU/commute-compares.ll
new file mode 100644
index 000000000000..31766047a358
--- /dev/null
+++ b/test/CodeGen/AMDGPU/commute-compares.ll
@@ -0,0 +1,697 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+
+declare i32 @llvm.r600.read.tidig.x() #0
+
+; --------------------------------------------------------------------------------
+; i32 compares
+; --------------------------------------------------------------------------------
+
+; GCN-LABEL: {{^}}commute_eq_64_i32:
+; GCN: v_cmp_eq_i32_e32 vcc, 64, v{{[0-9]+}}
+define void @commute_eq_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %val = load i32, i32 addrspace(1)* %gep.in
+ %cmp = icmp eq i32 %val, 64
+ %ext = sext i1 %cmp to i32
+ store i32 %ext, i32 addrspace(1)* %gep.out
+ ret void
+}
+
+; GCN-LABEL: {{^}}commute_ne_64_i32:
+; GCN: v_cmp_ne_i32_e32 vcc, 64, v{{[0-9]+}}
+define void @commute_ne_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %val = load i32, i32 addrspace(1)* %gep.in
+ %cmp = icmp ne i32 %val, 64
+ %ext = sext i1 %cmp to i32
+ store i32 %ext, i32 addrspace(1)* %gep.out
+ ret void
+}
+
+; FIXME: Why isn't this being folded as a constant?
+; GCN-LABEL: {{^}}commute_ne_litk_i32:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x3039
+; GCN: v_cmp_ne_i32_e32 vcc, [[K]], v{{[0-9]+}}
+define void @commute_ne_litk_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %val = load i32, i32 addrspace(1)* %gep.in
+ %cmp = icmp ne i32 %val, 12345
+ %ext = sext i1 %cmp to i32
+ store i32 %ext, i32 addrspace(1)* %gep.out
+ ret void
+}
+
+; GCN-LABEL: {{^}}commute_ugt_64_i32:
+; GCN: v_cmp_lt_u32_e32 vcc, 64, v{{[0-9]+}}
+define void @commute_ugt_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %val = load i32, i32 addrspace(1)* %gep.in
+ %cmp = icmp ugt i32 %val, 64
+ %ext = sext i1 %cmp to i32
+ store i32 %ext, i32 addrspace(1)* %gep.out
+ ret void
+}
+
+; GCN-LABEL: {{^}}commute_uge_64_i32:
+; GCN: v_cmp_lt_u32_e32 vcc, 63, v{{[0-9]+}}
+define void @commute_uge_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %val = load i32, i32 addrspace(1)* %gep.in
+ %cmp = icmp uge i32 %val, 64
+ %ext = sext i1 %cmp to i32
+ store i32 %ext, i32 addrspace(1)* %gep.out
+ ret void
+}
+
+; GCN-LABEL: {{^}}commute_ult_64_i32:
+; GCN: v_cmp_gt_u32_e32 vcc, 64, v{{[0-9]+}}
+define void @commute_ult_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %val = load i32, i32 addrspace(1)* %gep.in
+ %cmp = icmp ult i32 %val, 64
+ %ext = sext i1 %cmp to i32
+ store i32 %ext, i32 addrspace(1)* %gep.out
+ ret void
+}
+
+; GCN-LABEL: {{^}}commute_ule_63_i32:
+; GCN: v_cmp_gt_u32_e32 vcc, 64, v{{[0-9]+}}
+define void @commute_ule_63_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %val = load i32, i32 addrspace(1)* %gep.in
+ %cmp = icmp ule i32 %val, 63
+ %ext = sext i1 %cmp to i32
+ store i32 %ext, i32 addrspace(1)* %gep.out
+ ret void
+}
+
+; FIXME: Undo canonicalization to gt (x + 1) since it doesn't use the inline imm
+
+; GCN-LABEL: {{^}}commute_ule_64_i32:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x41{{$}}
+; GCN: v_cmp_gt_u32_e32 vcc, [[K]], v{{[0-9]+}}
+define void @commute_ule_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %val = load i32, i32 addrspace(1)* %gep.in
+ %cmp = icmp ule i32 %val, 64
+ %ext = sext i1 %cmp to i32
+ store i32 %ext, i32 addrspace(1)* %gep.out
+ ret void
+}
+
+; GCN-LABEL: {{^}}commute_sgt_neg1_i32:
+; GCN: v_cmp_lt_i32_e32 vcc, -1, v{{[0-9]+}}
+define void @commute_sgt_neg1_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %val = load i32, i32 addrspace(1)* %gep.in
+ %cmp = icmp sgt i32 %val, -1
+ %ext = sext i1 %cmp to i32
+ store i32 %ext, i32 addrspace(1)* %gep.out
+ ret void
+}
+
+; GCN-LABEL: {{^}}commute_sge_neg2_i32:
+; GCN: v_cmp_lt_i32_e32 vcc, -3, v{{[0-9]+}}
+define void @commute_sge_neg2_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %val = load i32, i32 addrspace(1)* %gep.in
+ %cmp = icmp sge i32 %val, -2
+ %ext = sext i1 %cmp to i32
+ store i32 %ext, i32 addrspace(1)* %gep.out
+ ret void
+}
+
+; GCN-LABEL: {{^}}commute_slt_neg16_i32:
+; GCN: v_cmp_gt_i32_e32 vcc, -16, v{{[0-9]+}}
+define void @commute_slt_neg16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %val = load i32, i32 addrspace(1)* %gep.in
+ %cmp = icmp slt i32 %val, -16
+ %ext = sext i1 %cmp to i32
+ store i32 %ext, i32 addrspace(1)* %gep.out
+ ret void
+}
+
+; GCN-LABEL: {{^}}commute_sle_5_i32:
+; GCN: v_cmp_gt_i32_e32 vcc, 6, v{{[0-9]+}}
+define void @commute_sle_5_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %val = load i32, i32 addrspace(1)* %gep.in
+ %cmp = icmp sle i32 %val, 5
+ %ext = sext i1 %cmp to i32
+ store i32 %ext, i32 addrspace(1)* %gep.out
+ ret void
+}
+
+; --------------------------------------------------------------------------------
+; i64 compares
+; --------------------------------------------------------------------------------
+
+; GCN-LABEL: {{^}}commute_eq_64_i64:
+; GCN: v_cmp_eq_i64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}}
+define void @commute_eq_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %val = load i64, i64 addrspace(1)* %gep.in
+ %cmp = icmp eq i64 %val, 64
+ %ext = sext i1 %cmp to i32
+ store i32 %ext, i32 addrspace(1)* %gep.out
+ ret void
+}
+
+; GCN-LABEL: {{^}}commute_ne_64_i64:
+; GCN: v_cmp_ne_i64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}}
+define void @commute_ne_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %val = load i64, i64 addrspace(1)* %gep.in
+ %cmp = icmp ne i64 %val, 64
+ %ext = sext i1 %cmp to i32
+ store i32 %ext, i32 addrspace(1)* %gep.out
+ ret void
+}
+
+; GCN-LABEL: {{^}}commute_ugt_64_i64:
+; GCN: v_cmp_lt_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}}
+define void @commute_ugt_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %val = load i64, i64 addrspace(1)* %gep.in
+ %cmp = icmp ugt i64 %val, 64
+ %ext = sext i1 %cmp to i32
+ store i32 %ext, i32 addrspace(1)* %gep.out
+ ret void
+}
+
+; GCN-LABEL: {{^}}commute_uge_64_i64:
+; GCN: v_cmp_lt_u64_e32 vcc, 63, v{{\[[0-9]+:[0-9]+\]}}
+define void @commute_uge_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %val = load i64, i64 addrspace(1)* %gep.in
+ %cmp = icmp uge i64 %val, 64
+ %ext = sext i1 %cmp to i32
+ store i32 %ext, i32 addrspace(1)* %gep.out
+ ret void
+}
+
+; GCN-LABEL: {{^}}commute_ult_64_i64:
+; GCN: v_cmp_gt_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}}
+define void @commute_ult_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %val = load i64, i64 addrspace(1)* %gep.in
+ %cmp = icmp ult i64 %val, 64
+ %ext = sext i1 %cmp to i32
+ store i32 %ext, i32 addrspace(1)* %gep.out
+ ret void
+}
+
+; GCN-LABEL: {{^}}commute_ule_63_i64:
+; GCN: v_cmp_gt_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}}
+define void @commute_ule_63_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %val = load i64, i64 addrspace(1)* %gep.in
+ %cmp = icmp ule i64 %val, 63
+ %ext = sext i1 %cmp to i32
+ store i32 %ext, i32 addrspace(1)* %gep.out
+ ret void
+}
+
+; FIXME: Undo canonicalization to gt (x + 1) since it doesn't use the inline imm
+
+; GCN-LABEL: {{^}}commute_ule_64_i64:
+; GCN-DAG: s_movk_i32 s[[KLO:[0-9]+]], 0x41{{$}}
+; GCN: v_cmp_gt_u64_e32 vcc, s{{\[}}[[KLO]]:{{[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}
+define void @commute_ule_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %val = load i64, i64 addrspace(1)* %gep.in
+ %cmp = icmp ule i64 %val, 64
+ %ext = sext i1 %cmp to i32
+ store i32 %ext, i32 addrspace(1)* %gep.out
+ ret void
+}
+
+; GCN-LABEL: {{^}}commute_sgt_neg1_i64:
+; GCN: v_cmp_lt_i64_e32 vcc, -1, v{{\[[0-9]+:[0-9]+\]}}
+define void @commute_sgt_neg1_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %val = load i64, i64 addrspace(1)* %gep.in
+ %cmp = icmp sgt i64 %val, -1
+ %ext = sext i1 %cmp to i32
+ store i32 %ext, i32 addrspace(1)* %gep.out
+ ret void
+}
+
+; GCN-LABEL: {{^}}commute_sge_neg2_i64:
+; GCN: v_cmp_lt_i64_e32 vcc, -3, v{{\[[0-9]+:[0-9]+\]}}
+define void @commute_sge_neg2_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %val = load i64, i64 addrspace(1)* %gep.in
+ %cmp = icmp sge i64 %val, -2
+ %ext = sext i1 %cmp to i32
+ store i32 %ext, i32 addrspace(1)* %gep.out
+ ret void
+}
+
+; GCN-LABEL: {{^}}commute_slt_neg16_i64:
+; GCN: v_cmp_gt_i64_e32 vcc, -16, v{{\[[0-9]+:[0-9]+\]}}
+define void @commute_slt_neg16_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %val = load i64, i64 addrspace(1)* %gep.in
+ %cmp = icmp slt i64 %val, -16
+ %ext = sext i1 %cmp to i32
+ store i32 %ext, i32 addrspace(1)* %gep.out
+ ret void
+}
+
+; GCN-LABEL: {{^}}commute_sle_5_i64:
+; GCN: v_cmp_gt_i64_e32 vcc, 6, v{{\[[0-9]+:[0-9]+\]}}
+define void @commute_sle_5_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %val = load i64, i64 addrspace(1)* %gep.in
+ %cmp = icmp sle i64 %val, 5
+ %ext = sext i1 %cmp to i32
+ store i32 %ext, i32 addrspace(1)* %gep.out
+ ret void
+}
+
+; --------------------------------------------------------------------------------
+; f32 compares
+; --------------------------------------------------------------------------------
+
+
+; GCN-LABEL: {{^}}commute_oeq_2.0_f32:
+; GCN: v_cmp_eq_f32_e32 vcc, 2.0, v{{[0-9]+}}
+define void @commute_oeq_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %val = load float, float addrspace(1)* %gep.in
+ %cmp = fcmp oeq float %val, 2.0
+ %ext = sext i1 %cmp to i32
+ store i32 %ext, i32 addrspace(1)* %gep.out
+ ret void
+}
+
+
+; GCN-LABEL: {{^}}commute_ogt_2.0_f32:
+; GCN: v_cmp_lt_f32_e32 vcc, 2.0, v{{[0-9]+}}
+define void @commute_ogt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %val = load float, float addrspace(1)* %gep.in
+ %cmp = fcmp ogt float %val, 2.0
+ %ext = sext i1 %cmp to i32
+ store i32 %ext, i32 addrspace(1)* %gep.out
+ ret void
+}
+
+; GCN-LABEL: {{^}}commute_oge_2.0_f32:
+; GCN: v_cmp_le_f32_e32 vcc, 2.0, v{{[0-9]+}}
+define void @commute_oge_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %val = load float, float addrspace(1)* %gep.in
+ %cmp = fcmp oge float %val, 2.0
+ %ext = sext i1 %cmp to i32
+ store i32 %ext, i32 addrspace(1)* %gep.out
+ ret void
+}
+
+; GCN-LABEL: {{^}}commute_olt_2.0_f32:
+; GCN: v_cmp_gt_f32_e32 vcc, 2.0, v{{[0-9]+}}
+define void @commute_olt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %val = load float, float addrspace(1)* %gep.in
+ %cmp = fcmp olt float %val, 2.0
+ %ext = sext i1 %cmp to i32
+ store i32 %ext, i32 addrspace(1)* %gep.out
+ ret void
+}
+
+; GCN-LABEL: {{^}}commute_ole_2.0_f32:
+; GCN: v_cmp_ge_f32_e32 vcc, 2.0, v{{[0-9]+}}
+define void @commute_ole_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %val = load float, float addrspace(1)* %gep.in
+ %cmp = fcmp ole float %val, 2.0
+ %ext = sext i1 %cmp to i32
+ store i32 %ext, i32 addrspace(1)* %gep.out
+ ret void
+}
+
+; GCN-LABEL: {{^}}commute_one_2.0_f32:
+; GCN: v_cmp_lg_f32_e32 vcc, 2.0, v{{[0-9]+}}
+define void @commute_one_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %val = load float, float addrspace(1)* %gep.in
+ %cmp = fcmp one float %val, 2.0
+ %ext = sext i1 %cmp to i32
+ store i32 %ext, i32 addrspace(1)* %gep.out
+ ret void
+}
+
+; GCN-LABEL: {{^}}commute_ord_2.0_f32:
+; GCN: v_cmp_o_f32_e32 vcc, [[REG:v[0-9]+]], [[REG]]
+define void @commute_ord_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %val = load float, float addrspace(1)* %gep.in
+ %cmp = fcmp ord float %val, 2.0
+ %ext = sext i1 %cmp to i32
+ store i32 %ext, i32 addrspace(1)* %gep.out
+ ret void
+}
+
+; GCN-LABEL: {{^}}commute_ueq_2.0_f32:
+; GCN: v_cmp_nlg_f32_e32 vcc, 2.0, v{{[0-9]+}}
+define void @commute_ueq_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %val = load float, float addrspace(1)* %gep.in
+ %cmp = fcmp ueq float %val, 2.0
+ %ext = sext i1 %cmp to i32
+ store i32 %ext, i32 addrspace(1)* %gep.out
+ ret void
+}
+
+; GCN-LABEL: {{^}}commute_ugt_2.0_f32:
+; GCN: v_cmp_nge_f32_e32 vcc, 2.0, v{{[0-9]+}}
+define void @commute_ugt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %val = load float, float addrspace(1)* %gep.in
+ %cmp = fcmp ugt float %val, 2.0
+ %ext = sext i1 %cmp to i32
+ store i32 %ext, i32 addrspace(1)* %gep.out
+ ret void
+}
+
+; GCN-LABEL: {{^}}commute_uge_2.0_f32:
+; GCN: v_cmp_ngt_f32_e32 vcc, 2.0, v{{[0-9]+}}
+define void @commute_uge_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %val = load float, float addrspace(1)* %gep.in
+ %cmp = fcmp uge float %val, 2.0
+ %ext = sext i1 %cmp to i32
+ store i32 %ext, i32 addrspace(1)* %gep.out
+ ret void
+}
+
+; GCN-LABEL: {{^}}commute_ult_2.0_f32:
+; GCN: v_cmp_nle_f32_e32 vcc, 2.0, v{{[0-9]+}}
+define void @commute_ult_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %val = load float, float addrspace(1)* %gep.in
+ %cmp = fcmp ult float %val, 2.0
+ %ext = sext i1 %cmp to i32
+ store i32 %ext, i32 addrspace(1)* %gep.out
+ ret void
+}
+
+; GCN-LABEL: {{^}}commute_ule_2.0_f32:
+; GCN: v_cmp_nlt_f32_e32 vcc, 2.0, v{{[0-9]+}}
+define void @commute_ule_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %val = load float, float addrspace(1)* %gep.in
+ %cmp = fcmp ule float %val, 2.0
+ %ext = sext i1 %cmp to i32
+ store i32 %ext, i32 addrspace(1)* %gep.out
+ ret void
+}
+
+; GCN-LABEL: {{^}}commute_une_2.0_f32:
+; GCN: v_cmp_neq_f32_e32 vcc, 2.0, v{{[0-9]+}}
+define void @commute_une_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %val = load float, float addrspace(1)* %gep.in
+ %cmp = fcmp une float %val, 2.0
+ %ext = sext i1 %cmp to i32
+ store i32 %ext, i32 addrspace(1)* %gep.out
+ ret void
+}
+
+; GCN-LABEL: {{^}}commute_uno_2.0_f32:
+; GCN: v_cmp_u_f32_e32 vcc, [[REG:v[0-9]+]], [[REG]]
+define void @commute_uno_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %val = load float, float addrspace(1)* %gep.in
+ %cmp = fcmp uno float %val, 2.0
+ %ext = sext i1 %cmp to i32
+ store i32 %ext, i32 addrspace(1)* %gep.out
+ ret void
+}
+
+; --------------------------------------------------------------------------------
+; f64 compares
+; --------------------------------------------------------------------------------
+
+
+; GCN-LABEL: {{^}}commute_oeq_2.0_f64:
+; GCN: v_cmp_eq_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
+define void @commute_oeq_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %val = load double, double addrspace(1)* %gep.in
+ %cmp = fcmp oeq double %val, 2.0
+ %ext = sext i1 %cmp to i32
+ store i32 %ext, i32 addrspace(1)* %gep.out
+ ret void
+}
+
+
+; GCN-LABEL: {{^}}commute_ogt_2.0_f64:
+; GCN: v_cmp_lt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
+define void @commute_ogt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %val = load double, double addrspace(1)* %gep.in
+ %cmp = fcmp ogt double %val, 2.0
+ %ext = sext i1 %cmp to i32
+ store i32 %ext, i32 addrspace(1)* %gep.out
+ ret void
+}
+
+; GCN-LABEL: {{^}}commute_oge_2.0_f64:
+; GCN: v_cmp_le_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
+define void @commute_oge_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %val = load double, double addrspace(1)* %gep.in
+ %cmp = fcmp oge double %val, 2.0
+ %ext = sext i1 %cmp to i32
+ store i32 %ext, i32 addrspace(1)* %gep.out
+ ret void
+}
+
+; GCN-LABEL: {{^}}commute_olt_2.0_f64:
+; GCN: v_cmp_gt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
+define void @commute_olt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %val = load double, double addrspace(1)* %gep.in
+ %cmp = fcmp olt double %val, 2.0
+ %ext = sext i1 %cmp to i32
+ store i32 %ext, i32 addrspace(1)* %gep.out
+ ret void
+}
+
+; GCN-LABEL: {{^}}commute_ole_2.0_f64:
+; GCN: v_cmp_ge_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
+define void @commute_ole_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %val = load double, double addrspace(1)* %gep.in
+ %cmp = fcmp ole double %val, 2.0
+ %ext = sext i1 %cmp to i32
+ store i32 %ext, i32 addrspace(1)* %gep.out
+ ret void
+}
+
+; GCN-LABEL: {{^}}commute_one_2.0_f64:
+; GCN: v_cmp_lg_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
+define void @commute_one_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %val = load double, double addrspace(1)* %gep.in
+ %cmp = fcmp one double %val, 2.0
+ %ext = sext i1 %cmp to i32
+ store i32 %ext, i32 addrspace(1)* %gep.out
+ ret void
+}
+
+; GCN-LABEL: {{^}}commute_ord_2.0_f64:
+; GCN: v_cmp_o_f64_e32 vcc, [[REG:v\[[0-9]+:[0-9]+\]]], [[REG]]
+define void @commute_ord_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %val = load double, double addrspace(1)* %gep.in
+ %cmp = fcmp ord double %val, 2.0
+ %ext = sext i1 %cmp to i32
+ store i32 %ext, i32 addrspace(1)* %gep.out
+ ret void
+}
+
+; GCN-LABEL: {{^}}commute_ueq_2.0_f64:
+; GCN: v_cmp_nlg_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
+define void @commute_ueq_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %val = load double, double addrspace(1)* %gep.in
+ %cmp = fcmp ueq double %val, 2.0
+ %ext = sext i1 %cmp to i32
+ store i32 %ext, i32 addrspace(1)* %gep.out
+ ret void
+}
+
+; GCN-LABEL: {{^}}commute_ugt_2.0_f64:
+; GCN: v_cmp_nge_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
+define void @commute_ugt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %val = load double, double addrspace(1)* %gep.in
+ %cmp = fcmp ugt double %val, 2.0
+ %ext = sext i1 %cmp to i32
+ store i32 %ext, i32 addrspace(1)* %gep.out
+ ret void
+}
+
+; GCN-LABEL: {{^}}commute_uge_2.0_f64:
+; GCN: v_cmp_ngt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
+define void @commute_uge_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %val = load double, double addrspace(1)* %gep.in
+ %cmp = fcmp uge double %val, 2.0
+ %ext = sext i1 %cmp to i32
+ store i32 %ext, i32 addrspace(1)* %gep.out
+ ret void
+}
+
+; GCN-LABEL: {{^}}commute_ult_2.0_f64:
+; GCN: v_cmp_nle_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
+define void @commute_ult_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %val = load double, double addrspace(1)* %gep.in
+ %cmp = fcmp ult double %val, 2.0
+ %ext = sext i1 %cmp to i32
+ store i32 %ext, i32 addrspace(1)* %gep.out
+ ret void
+}
+
+; GCN-LABEL: {{^}}commute_ule_2.0_f64:
+; GCN: v_cmp_nlt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
+define void @commute_ule_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %val = load double, double addrspace(1)* %gep.in
+ %cmp = fcmp ule double %val, 2.0
+ %ext = sext i1 %cmp to i32
+ store i32 %ext, i32 addrspace(1)* %gep.out
+ ret void
+}
+
+; GCN-LABEL: {{^}}commute_une_2.0_f64:
+; GCN: v_cmp_neq_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
+define void @commute_une_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %val = load double, double addrspace(1)* %gep.in
+ %cmp = fcmp une double %val, 2.0
+ %ext = sext i1 %cmp to i32
+ store i32 %ext, i32 addrspace(1)* %gep.out
+ ret void
+}
+
+; GCN-LABEL: {{^}}commute_uno_2.0_f64:
+; GCN: v_cmp_u_f64_e32 vcc, [[REG:v\[[0-9]+:[0-9]+\]]], [[REG]]
+define void @commute_uno_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %val = load double, double addrspace(1)* %gep.in
+ %cmp = fcmp uno double %val, 2.0
+ %ext = sext i1 %cmp to i32
+ store i32 %ext, i32 addrspace(1)* %gep.out
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/commute_modifiers.ll b/test/CodeGen/AMDGPU/commute_modifiers.ll
new file mode 100644
index 000000000000..7fc36eabb780
--- /dev/null
+++ b/test/CodeGen/AMDGPU/commute_modifiers.ll
@@ -0,0 +1,181 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+declare i32 @llvm.r600.read.tidig.x() #1
+declare float @llvm.fabs.f32(float) #1
+declare float @llvm.fma.f32(float, float, float) nounwind readnone
+
+; FUNC-LABEL: @commute_add_imm_fabs_f32
+; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI: v_add_f32_e64 [[REG:v[0-9]+]], 2.0, |[[X]]|
+; SI-NEXT: buffer_store_dword [[REG]]
+define void @commute_add_imm_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %x = load float, float addrspace(1)* %gep.0
+ %x.fabs = call float @llvm.fabs.f32(float %x) #1
+ %z = fadd float 2.0, %x.fabs
+ store float %z, float addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: @commute_mul_imm_fneg_fabs_f32
+; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI: v_mul_f32_e64 [[REG:v[0-9]+]], -4.0, |[[X]]|
+; SI-NEXT: buffer_store_dword [[REG]]
+define void @commute_mul_imm_fneg_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %x = load float, float addrspace(1)* %gep.0
+ %x.fabs = call float @llvm.fabs.f32(float %x) #1
+ %x.fneg.fabs = fsub float -0.000000e+00, %x.fabs
+ %z = fmul float 4.0, %x.fneg.fabs
+ store float %z, float addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: @commute_mul_imm_fneg_f32
+; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI: v_mul_f32_e32 [[REG:v[0-9]+]], -4.0, [[X]]
+; SI-NEXT: buffer_store_dword [[REG]]
+define void @commute_mul_imm_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %x = load float, float addrspace(1)* %gep.0
+ %x.fneg = fsub float -0.000000e+00, %x
+ %z = fmul float 4.0, %x.fneg
+ store float %z, float addrspace(1)* %out
+ ret void
+}
+
+; FIXME: Should use SGPR for literal.
+; FUNC-LABEL: @commute_add_lit_fabs_f32
+; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI: v_mov_b32_e32 [[K:v[0-9]+]], 0x44800000
+; SI: v_add_f32_e64 [[REG:v[0-9]+]], |[[X]]|, [[K]]
+; SI-NEXT: buffer_store_dword [[REG]]
+define void @commute_add_lit_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %x = load float, float addrspace(1)* %gep.0
+ %x.fabs = call float @llvm.fabs.f32(float %x) #1
+ %z = fadd float 1024.0, %x.fabs
+ store float %z, float addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: @commute_add_fabs_f32
+; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; SI: v_add_f32_e64 [[REG:v[0-9]+]], [[X]], |[[Y]]|
+; SI-NEXT: buffer_store_dword [[REG]]
+define void @commute_add_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+ %x = load float, float addrspace(1)* %gep.0
+ %y = load float, float addrspace(1)* %gep.1
+ %y.fabs = call float @llvm.fabs.f32(float %y) #1
+ %z = fadd float %x, %y.fabs
+ store float %z, float addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: @commute_mul_fneg_f32
+; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; SI: v_mul_f32_e64 [[REG:v[0-9]+]], [[X]], -[[Y]]
+; SI-NEXT: buffer_store_dword [[REG]]
+define void @commute_mul_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+ %x = load float, float addrspace(1)* %gep.0
+ %y = load float, float addrspace(1)* %gep.1
+ %y.fneg = fsub float -0.000000e+00, %y
+ %z = fmul float %x, %y.fneg
+ store float %z, float addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: @commute_mul_fabs_fneg_f32
+; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; SI: v_mul_f32_e64 [[REG:v[0-9]+]], [[X]], -|[[Y]]|
+; SI-NEXT: buffer_store_dword [[REG]]
+define void @commute_mul_fabs_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+ %x = load float, float addrspace(1)* %gep.0
+ %y = load float, float addrspace(1)* %gep.1
+ %y.fabs = call float @llvm.fabs.f32(float %y) #1
+ %y.fabs.fneg = fsub float -0.000000e+00, %y.fabs
+ %z = fmul float %x, %y.fabs.fneg
+ store float %z, float addrspace(1)* %out
+ ret void
+}
+
+; There's no reason to commute this.
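+; (Presumably: both sources already take the same fabs modifier, so swapping
+; them gains nothing.)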
+; FUNC-LABEL: @commute_mul_fabs_x_fabs_y_f32
+; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; SI: v_mul_f32_e64 [[REG:v[0-9]+]], |[[X]]|, |[[Y]]|
+; SI-NEXT: buffer_store_dword [[REG]]
+define void @commute_mul_fabs_x_fabs_y_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+ %x = load float, float addrspace(1)* %gep.0
+ %y = load float, float addrspace(1)* %gep.1
+ %x.fabs = call float @llvm.fabs.f32(float %x) #1
+ %y.fabs = call float @llvm.fabs.f32(float %y) #1
+ %z = fmul float %x.fabs, %y.fabs
+ store float %z, float addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: @commute_mul_fabs_x_fneg_fabs_y_f32
+; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; SI: v_mul_f32_e64 [[REG:v[0-9]+]], |[[X]]|, -|[[Y]]|
+; SI-NEXT: buffer_store_dword [[REG]]
+define void @commute_mul_fabs_x_fneg_fabs_y_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+ %x = load float, float addrspace(1)* %gep.0
+ %y = load float, float addrspace(1)* %gep.1
+ %x.fabs = call float @llvm.fabs.f32(float %x) #1
+ %y.fabs = call float @llvm.fabs.f32(float %y) #1
+ %y.fabs.fneg = fsub float -0.000000e+00, %y.fabs
+ %z = fmul float %x.fabs, %y.fabs.fneg
+ store float %z, float addrspace(1)* %out
+ ret void
+}
+
+; Make sure we commute the multiply part for the constant in src0 even
+; though we have the fabs modifier on src2.
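+; (The IR computes fma(%r1, 2.0, |%r2|); commuting the multiply operands lets
+; 2.0 sit in src0, where it can be encoded as an inline immediate, as the
+; v_fma_f32 check below expects.)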
+
+; SI-LABEL: {{^}}fma_a_2.0_neg_b_f32
+; SI-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; SI: v_fma_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], |[[R2]]|
+; SI: buffer_store_dword [[RESULT]]
+define void @fma_a_2.0_neg_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
+ %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+ %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
+
+ %r1 = load float, float addrspace(1)* %gep.0
+ %r2 = load float, float addrspace(1)* %gep.1
+
+ %r2.fabs = call float @llvm.fabs.f32(float %r2)
+
+ %r3 = tail call float @llvm.fma.f32(float %r1, float 2.0, float %r2.fabs)
+ store float %r3, float addrspace(1)* %gep.out
+ ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/complex-folding.ll b/test/CodeGen/AMDGPU/complex-folding.ll
new file mode 100644
index 000000000000..a5399a71324c
--- /dev/null
+++ b/test/CodeGen/AMDGPU/complex-folding.ll
@@ -0,0 +1,19 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+; CHECK: {{^}}main:
+; CHECK-NOT: MOV
+define void @main(<4 x float> inreg %reg0) #0 {
+entry:
+ %0 = extractelement <4 x float> %reg0, i32 0
+ %1 = call float @fabs(float %0)
+ %2 = fptoui float %1 to i32
+ %3 = bitcast i32 %2 to float
+ %4 = insertelement <4 x float> undef, float %3, i32 0
+ call void @llvm.R600.store.swizzle(<4 x float> %4, i32 0, i32 0)
+ ret void
+}
+
+declare float @fabs(float ) readnone
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+
+attributes #0 = { "ShaderType"="0" } \ No newline at end of file
diff --git a/test/CodeGen/AMDGPU/concat_vectors.ll b/test/CodeGen/AMDGPU/concat_vectors.ll
new file mode 100644
index 000000000000..a09ed1f73857
--- /dev/null
+++ b/test/CodeGen/AMDGPU/concat_vectors.ll
@@ -0,0 +1,296 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}test_concat_v1i32:
+; 0x80f000 is the high 32 bits of the resource descriptor used by MUBUF
+; instructions that access scratch memory. Bit 23, which is the add_tid_enable
+; bit, is only set for scratch access, so we can check for the absence of this
+; value if we want to ensure scratch memory is not being used.
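+; (For reference: 0x80f000 & 0x800000 is nonzero, i.e. bit 23 is set in this
+; constant, so its absence implies no scratch resource descriptor was built.)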
+; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v1i32(<2 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) nounwind {
+ %concat = shufflevector <1 x i32> %a, <1 x i32> %b, <2 x i32> <i32 0, i32 1>
+ store <2 x i32> %concat, <2 x i32> addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v2i32:
+; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v2i32(<4 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind {
+ %concat = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ store <4 x i32> %concat, <4 x i32> addrspace(1)* %out, align 16
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v4i32:
+; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v4i32(<8 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) nounwind {
+ %concat = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ store <8 x i32> %concat, <8 x i32> addrspace(1)* %out, align 32
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v8i32:
+; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v8i32(<16 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) nounwind {
+ %concat = shufflevector <8 x i32> %a, <8 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ store <16 x i32> %concat, <16 x i32> addrspace(1)* %out, align 64
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v16i32:
+; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v16i32(<32 x i32> addrspace(1)* %out, <16 x i32> %a, <16 x i32> %b) nounwind {
+ %concat = shufflevector <16 x i32> %a, <16 x i32> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ store <32 x i32> %concat, <32 x i32> addrspace(1)* %out, align 128
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v1f32:
+; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v1f32(<2 x float> addrspace(1)* %out, <1 x float> %a, <1 x float> %b) nounwind {
+ %concat = shufflevector <1 x float> %a, <1 x float> %b, <2 x i32> <i32 0, i32 1>
+ store <2 x float> %concat, <2 x float> addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v2f32:
+; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v2f32(<4 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) nounwind {
+ %concat = shufflevector <2 x float> %a, <2 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ store <4 x float> %concat, <4 x float> addrspace(1)* %out, align 16
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v4f32:
+; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v4f32(<8 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) nounwind {
+ %concat = shufflevector <4 x float> %a, <4 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ store <8 x float> %concat, <8 x float> addrspace(1)* %out, align 32
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v8f32:
+; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v8f32(<16 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) nounwind {
+ %concat = shufflevector <8 x float> %a, <8 x float> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ store <16 x float> %concat, <16 x float> addrspace(1)* %out, align 64
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v16f32:
+; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v16f32(<32 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) nounwind {
+ %concat = shufflevector <16 x float> %a, <16 x float> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ store <32 x float> %concat, <32 x float> addrspace(1)* %out, align 128
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v1i64:
+; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v1i64(<2 x double> addrspace(1)* %out, <1 x double> %a, <1 x double> %b) nounwind {
+ %concat = shufflevector <1 x double> %a, <1 x double> %b, <2 x i32> <i32 0, i32 1>
+ store <2 x double> %concat, <2 x double> addrspace(1)* %out, align 16
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v2i64:
+; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v2i64(<4 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind {
+ %concat = shufflevector <2 x double> %a, <2 x double> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ store <4 x double> %concat, <4 x double> addrspace(1)* %out, align 32
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v4i64:
+; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v4i64(<8 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind {
+ %concat = shufflevector <4 x double> %a, <4 x double> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ store <8 x double> %concat, <8 x double> addrspace(1)* %out, align 64
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v8i64:
+; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v8i64(<16 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind {
+ %concat = shufflevector <8 x double> %a, <8 x double> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ store <16 x double> %concat, <16 x double> addrspace(1)* %out, align 128
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v16i64:
+; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v16i64(<32 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind {
+ %concat = shufflevector <16 x double> %a, <16 x double> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ store <32 x double> %concat, <32 x double> addrspace(1)* %out, align 256
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v1f64:
+; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v1f64(<2 x double> addrspace(1)* %out, <1 x double> %a, <1 x double> %b) nounwind {
+ %concat = shufflevector <1 x double> %a, <1 x double> %b, <2 x i32> <i32 0, i32 1>
+ store <2 x double> %concat, <2 x double> addrspace(1)* %out, align 16
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v2f64:
+; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v2f64(<4 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind {
+ %concat = shufflevector <2 x double> %a, <2 x double> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ store <4 x double> %concat, <4 x double> addrspace(1)* %out, align 32
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v4f64:
+; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v4f64(<8 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind {
+ %concat = shufflevector <4 x double> %a, <4 x double> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ store <8 x double> %concat, <8 x double> addrspace(1)* %out, align 64
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v8f64:
+; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v8f64(<16 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind {
+ %concat = shufflevector <8 x double> %a, <8 x double> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ store <16 x double> %concat, <16 x double> addrspace(1)* %out, align 128
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v16f64:
+; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v16f64(<32 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind {
+ %concat = shufflevector <16 x double> %a, <16 x double> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ store <32 x double> %concat, <32 x double> addrspace(1)* %out, align 256
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v1i1:
+; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v1i1(<2 x i1> addrspace(1)* %out, <1 x i1> %a, <1 x i1> %b) nounwind {
+ %concat = shufflevector <1 x i1> %a, <1 x i1> %b, <2 x i32> <i32 0, i32 1>
+ store <2 x i1> %concat, <2 x i1> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v2i1:
+; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v2i1(<4 x i1> addrspace(1)* %out, <2 x i1> %a, <2 x i1> %b) nounwind {
+ %concat = shufflevector <2 x i1> %a, <2 x i1> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ store <4 x i1> %concat, <4 x i1> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v4i1:
+; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v4i1(<8 x i1> addrspace(1)* %out, <4 x i1> %a, <4 x i1> %b) nounwind {
+ %concat = shufflevector <4 x i1> %a, <4 x i1> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ store <8 x i1> %concat, <8 x i1> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v8i1:
+; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v8i1(<16 x i1> addrspace(1)* %out, <8 x i1> %a, <8 x i1> %b) nounwind {
+ %concat = shufflevector <8 x i1> %a, <8 x i1> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ store <16 x i1> %concat, <16 x i1> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v16i1:
+; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v16i1(<32 x i1> addrspace(1)* %out, <16 x i1> %a, <16 x i1> %b) nounwind {
+ %concat = shufflevector <16 x i1> %a, <16 x i1> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ store <32 x i1> %concat, <32 x i1> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v32i1:
+; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v32i1(<64 x i1> addrspace(1)* %out, <32 x i1> %a, <32 x i1> %b) nounwind {
+ %concat = shufflevector <32 x i1> %a, <32 x i1> %b, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+ store <64 x i1> %concat, <64 x i1> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v1i16:
+; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v1i16(<2 x i16> addrspace(1)* %out, <1 x i16> %a, <1 x i16> %b) nounwind {
+ %concat = shufflevector <1 x i16> %a, <1 x i16> %b, <2 x i32> <i32 0, i32 1>
+ store <2 x i16> %concat, <2 x i16> addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v2i16:
+; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v2i16(<4 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) nounwind {
+ %concat = shufflevector <2 x i16> %a, <2 x i16> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ store <4 x i16> %concat, <4 x i16> addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v4i16:
+; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v4i16(<8 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b) nounwind {
+ %concat = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ store <8 x i16> %concat, <8 x i16> addrspace(1)* %out, align 16
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v8i16:
+; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v8i16(<16 x i16> addrspace(1)* %out, <8 x i16> %a, <8 x i16> %b) nounwind {
+ %concat = shufflevector <8 x i16> %a, <8 x i16> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ store <16 x i16> %concat, <16 x i16> addrspace(1)* %out, align 32
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v16i16:
+; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v16i16(<32 x i16> addrspace(1)* %out, <16 x i16> %a, <16 x i16> %b) nounwind {
+ %concat = shufflevector <16 x i16> %a, <16 x i16> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ store <32 x i16> %concat, <32 x i16> addrspace(1)* %out, align 64
+ ret void
+}
+
+; FUNC-LABEL: {{^}}concat_vector_crash:
+; SI: s_endpgm
+define void @concat_vector_crash(<8 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) {
+bb:
+ %tmp = load <2 x float>, <2 x float> addrspace(1)* %in, align 4
+ %tmp1 = shufflevector <2 x float> %tmp, <2 x float> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %tmp2 = shufflevector <8 x float> undef, <8 x float> %tmp1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+ store <8 x float> %tmp2, <8 x float> addrspace(1)* %out, align 32
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/copy-illegal-type.ll b/test/CodeGen/AMDGPU/copy-illegal-type.ll
new file mode 100644
index 000000000000..8b397566066a
--- /dev/null
+++ b/test/CodeGen/AMDGPU/copy-illegal-type.ll
@@ -0,0 +1,167 @@
+; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}test_copy_v4i8:
+; SI: buffer_load_dword [[REG:v[0-9]+]]
+; SI: buffer_store_dword [[REG]]
+; SI: s_endpgm
+define void @test_copy_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
+ %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
+ store <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_copy_v4i8_x2:
+; SI: buffer_load_dword [[REG:v[0-9]+]]
+; SI: buffer_store_dword [[REG]]
+; SI: buffer_store_dword [[REG]]
+; SI: s_endpgm
+define void @test_copy_v4i8_x2(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind {
+ %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
+ store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
+ store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_copy_v4i8_x3:
+; SI: buffer_load_dword [[REG:v[0-9]+]]
+; SI: buffer_store_dword [[REG]]
+; SI: buffer_store_dword [[REG]]
+; SI: buffer_store_dword [[REG]]
+; SI: s_endpgm
+define void @test_copy_v4i8_x3(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind {
+ %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
+ store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
+ store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4
+ store <4 x i8> %val, <4 x i8> addrspace(1)* %out2, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_copy_v4i8_x4:
+; SI: buffer_load_dword [[REG:v[0-9]+]]
+; SI: buffer_store_dword [[REG]]
+; SI: buffer_store_dword [[REG]]
+; SI: buffer_store_dword [[REG]]
+; SI: buffer_store_dword [[REG]]
+; SI: s_endpgm
+define void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %out3, <4 x i8> addrspace(1)* %in) nounwind {
+ %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
+ store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
+ store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4
+ store <4 x i8> %val, <4 x i8> addrspace(1)* %out2, align 4
+ store <4 x i8> %val, <4 x i8> addrspace(1)* %out3, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_copy_v4i8_extra_use:
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI-DAG: v_add
+; SI-DAG: v_add
+; SI-DAG: v_add
+; SI-DAG: v_add
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_store_byte
+
+; The disabled XSI checks below describe the expected code once scalarizing of v4i8 loads is fixed.
+; XSI: buffer_load_dword
+; XSI: V_BFE
+; XSI: V_ADD
+; XSI: V_ADD
+; XSI: V_ADD
+; XSI: buffer_store_dword
+; XSI: buffer_store_dword
+
+; SI: s_endpgm
+define void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind {
+ %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
+ %add = add <4 x i8> %val, <i8 9, i8 9, i8 9, i8 9>
+ store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
+ store <4 x i8> %add, <4 x i8> addrspace(1)* %out1, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_copy_v4i8_x2_extra_use:
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI-DAG: v_add
+; SI-DAG: v_add
+; SI-DAG: v_add
+; SI-DAG: v_add
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_store_byte
+
+; XSI: buffer_load_dword
+; XSI: BFE
+; XSI: buffer_store_dword
+; XSI: V_ADD
+; XSI: buffer_store_dword
+; XSI-NEXT: buffer_store_dword
+
+; SI: s_endpgm
+define void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind {
+ %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
+ %add = add <4 x i8> %val, <i8 9, i8 9, i8 9, i8 9>
+ store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
+ store <4 x i8> %add, <4 x i8> addrspace(1)* %out1, align 4
+ store <4 x i8> %val, <4 x i8> addrspace(1)* %out2, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_copy_v3i8:
+; SI-NOT: bfe
+; SI-NOT: bfi
+; SI: s_endpgm
+define void @test_copy_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
+ %val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 4
+ store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_copy_v4i8_volatile_load:
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: s_endpgm
+define void @test_copy_v4i8_volatile_load(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
+ %val = load volatile <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
+ store <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_copy_v4i8_volatile_store:
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: s_endpgm
+define void @test_copy_v4i8_volatile_store(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
+ %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
+ store volatile <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/copy-to-reg.ll b/test/CodeGen/AMDGPU/copy-to-reg.ll
new file mode 100644
index 000000000000..fc875f6ef7a3
--- /dev/null
+++ b/test/CodeGen/AMDGPU/copy-to-reg.ll
@@ -0,0 +1,27 @@
+; RUN: llc -march=amdgcn -mcpu=SI -mattr=-promote-alloca -verify-machineinstrs < %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %s
+
+; Test that CopyToReg instructions don't have non-register operands prior
+; to being emitted.
+
+; Make sure this doesn't crash
+; CHECK-LABEL: {{^}}copy_to_reg_frameindex:
+define void @copy_to_reg_frameindex(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
+entry:
+ %alloca = alloca [16 x i32]
+ br label %loop
+
+loop:
+ %inc = phi i32 [0, %entry], [%inc.i, %loop]
+ %ptr = getelementptr [16 x i32], [16 x i32]* %alloca, i32 0, i32 %inc
+ store i32 %inc, i32* %ptr
+ %inc.i = add i32 %inc, 1
+ %cnd = icmp uge i32 %inc.i, 16
+ br i1 %cnd, label %done, label %loop
+
+done:
+ %tmp0 = getelementptr [16 x i32], [16 x i32]* %alloca, i32 0, i32 0
+ %tmp1 = load i32, i32* %tmp0
+ store i32 %tmp1, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
new file mode 100644
index 000000000000..bd26c302fe5a
--- /dev/null
+++ b/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
@@ -0,0 +1,71 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+declare i32 @llvm.ctlz.i32(i32, i1) nounwind readnone
+declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1) nounwind readnone
+declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) nounwind readnone
+
+; FUNC-LABEL: {{^}}s_ctlz_zero_undef_i32:
+; SI: s_load_dword [[VAL:s[0-9]+]],
+; SI: s_flbit_i32_b32 [[SRESULT:s[0-9]+]], [[VAL]]
+; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
+; SI: buffer_store_dword [[VRESULT]],
+; SI: s_endpgm
+; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
+; EG: FFBH_UINT {{\*? *}}[[RESULT]]
+define void @s_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
+ %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
+ store i32 %ctlz, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32:
+; SI: buffer_load_dword [[VAL:v[0-9]+]],
+; SI: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
+; SI: buffer_store_dword [[RESULT]],
+; SI: s_endpgm
+; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
+; EG: FFBH_UINT {{\*? *}}[[RESULT]]
+define void @v_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+ %val = load i32, i32 addrspace(1)* %valptr, align 4
+ %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
+ store i32 %ctlz, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_ctlz_zero_undef_v2i32:
+; SI: buffer_load_dwordx2
+; SI: v_ffbh_u32_e32
+; SI: v_ffbh_u32_e32
+; SI: buffer_store_dwordx2
+; SI: s_endpgm
+; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}}
+; EG: FFBH_UINT {{\*? *}}[[RESULT]]
+; EG: FFBH_UINT {{\*? *}}[[RESULT]]
+define void @v_ctlz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
+ %val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr, align 8
+ %ctlz = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %val, i1 true) nounwind readnone
+ store <2 x i32> %ctlz, <2 x i32> addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_ctlz_zero_undef_v4i32:
+; SI: buffer_load_dwordx4
+; SI: v_ffbh_u32_e32
+; SI: v_ffbh_u32_e32
+; SI: v_ffbh_u32_e32
+; SI: v_ffbh_u32_e32
+; SI: buffer_store_dwordx4
+; SI: s_endpgm
+; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}}
+; EG: FFBH_UINT {{\*? *}}[[RESULT]]
+; EG: FFBH_UINT {{\*? *}}[[RESULT]]
+; EG: FFBH_UINT {{\*? *}}[[RESULT]]
+; EG: FFBH_UINT {{\*? *}}[[RESULT]]
+define void @v_ctlz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
+ %val = load <4 x i32>, <4 x i32> addrspace(1)* %valptr, align 16
+ %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %val, i1 true) nounwind readnone
+ store <4 x i32> %ctlz, <4 x i32> addrspace(1)* %out, align 16
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/ctpop.ll b/test/CodeGen/AMDGPU/ctpop.ll
new file mode 100644
index 000000000000..0a031c5e24d1
--- /dev/null
+++ b/test/CodeGen/AMDGPU/ctpop.ll
@@ -0,0 +1,300 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC -check-prefix=VI %s
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+declare i32 @llvm.ctpop.i32(i32) nounwind readnone
+declare <2 x i32> @llvm.ctpop.v2i32(<2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>) nounwind readnone
+declare <8 x i32> @llvm.ctpop.v8i32(<8 x i32>) nounwind readnone
+declare <16 x i32> @llvm.ctpop.v16i32(<16 x i32>) nounwind readnone
+
+; FUNC-LABEL: {{^}}s_ctpop_i32:
+; GCN: s_load_dword [[SVAL:s[0-9]+]],
+; GCN: s_bcnt1_i32_b32 [[SRESULT:s[0-9]+]], [[SVAL]]
+; GCN: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
+; GCN: buffer_store_dword [[VRESULT]],
+; GCN: s_endpgm
+
+; EG: BCNT_INT
+define void @s_ctpop_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
+ %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
+ store i32 %ctpop, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; XXX - Why 0 in register?
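+; (v_bcnt_u32_b32 computes countbits(src0) + src1, so a plain ctpop has to
+; supply 0 for the otherwise unused add operand.)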
+; FUNC-LABEL: {{^}}v_ctpop_i32:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]],
+; GCN: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], 0
+; GCN: buffer_store_dword [[RESULT]],
+; GCN: s_endpgm
+
+; EG: BCNT_INT
+define void @v_ctpop_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
+ %val = load i32, i32 addrspace(1)* %in, align 4
+ %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
+ store i32 %ctpop, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_ctpop_add_chain_i32:
+; GCN: buffer_load_dword [[VAL1:v[0-9]+]],
+; GCN: buffer_load_dword [[VAL0:v[0-9]+]],
+; GCN: v_bcnt_u32_b32_e64 [[MIDRESULT:v[0-9]+]], [[VAL1]], 0
+; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]]
+; VI: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]]
+; GCN: buffer_store_dword [[RESULT]],
+; GCN: s_endpgm
+
+; EG: BCNT_INT
+; EG: BCNT_INT
+define void @v_ctpop_add_chain_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1) nounwind {
+ %val0 = load i32, i32 addrspace(1)* %in0, align 4
+ %val1 = load i32, i32 addrspace(1)* %in1, align 4
+ %ctpop0 = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone
+ %ctpop1 = call i32 @llvm.ctpop.i32(i32 %val1) nounwind readnone
+ %add = add i32 %ctpop0, %ctpop1
+ store i32 %add, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_ctpop_add_sgpr_i32:
+; GCN: buffer_load_dword [[VAL0:v[0-9]+]],
+; GCN-NEXT: s_waitcnt
+; GCN-NEXT: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL0]], s{{[0-9]+}}
+; GCN-NEXT: buffer_store_dword [[RESULT]],
+; GCN: s_endpgm
+define void @v_ctpop_add_sgpr_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1, i32 %sval) nounwind {
+ %val0 = load i32, i32 addrspace(1)* %in0, align 4
+ %ctpop0 = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone
+ %add = add i32 %ctpop0, %sval
+ store i32 %add, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_ctpop_v2i32:
+; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32_e64
+; GCN: s_endpgm
+
+; EG: BCNT_INT
+; EG: BCNT_INT
+define void @v_ctpop_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %in) nounwind {
+ %val = load <2 x i32>, <2 x i32> addrspace(1)* %in, align 8
+ %ctpop = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %val) nounwind readnone
+ store <2 x i32> %ctpop, <2 x i32> addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_ctpop_v4i32:
+; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32_e64
+; GCN: s_endpgm
+
+; EG: BCNT_INT
+; EG: BCNT_INT
+; EG: BCNT_INT
+; EG: BCNT_INT
+define void @v_ctpop_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %in) nounwind {
+ %val = load <4 x i32>, <4 x i32> addrspace(1)* %in, align 16
+ %ctpop = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %val) nounwind readnone
+ store <4 x i32> %ctpop, <4 x i32> addrspace(1)* %out, align 16
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_ctpop_v8i32:
+; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32_e64
+; GCN: s_endpgm
+
+; EG: BCNT_INT
+; EG: BCNT_INT
+; EG: BCNT_INT
+; EG: BCNT_INT
+; EG: BCNT_INT
+; EG: BCNT_INT
+; EG: BCNT_INT
+; EG: BCNT_INT
+define void @v_ctpop_v8i32(<8 x i32> addrspace(1)* noalias %out, <8 x i32> addrspace(1)* noalias %in) nounwind {
+ %val = load <8 x i32>, <8 x i32> addrspace(1)* %in, align 32
+ %ctpop = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %val) nounwind readnone
+ store <8 x i32> %ctpop, <8 x i32> addrspace(1)* %out, align 32
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_ctpop_v16i32:
+; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32_e64
+; GCN: v_bcnt_u32_b32_e64
+; GCN: s_endpgm
+
+; EG: BCNT_INT
+; EG: BCNT_INT
+; EG: BCNT_INT
+; EG: BCNT_INT
+; EG: BCNT_INT
+; EG: BCNT_INT
+; EG: BCNT_INT
+; EG: BCNT_INT
+; EG: BCNT_INT
+; EG: BCNT_INT
+; EG: BCNT_INT
+; EG: BCNT_INT
+; EG: BCNT_INT
+; EG: BCNT_INT
+; EG: BCNT_INT
+; EG: BCNT_INT
+define void @v_ctpop_v16i32(<16 x i32> addrspace(1)* noalias %out, <16 x i32> addrspace(1)* noalias %in) nounwind {
+ %val = load <16 x i32>, <16 x i32> addrspace(1)* %in, align 32
+ %ctpop = call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %val) nounwind readnone
+ store <16 x i32> %ctpop, <16 x i32> addrspace(1)* %out, align 32
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_ctpop_i32_add_inline_constant:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]],
+; GCN: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], 4
+; GCN: buffer_store_dword [[RESULT]],
+; GCN: s_endpgm
+
+; EG: BCNT_INT
+define void @v_ctpop_i32_add_inline_constant(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
+ %val = load i32, i32 addrspace(1)* %in, align 4
+ %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
+ %add = add i32 %ctpop, 4
+ store i32 %add, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_ctpop_i32_add_inline_constant_inv:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]],
+; GCN: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], 4
+; GCN: buffer_store_dword [[RESULT]],
+; GCN: s_endpgm
+
+; EG: BCNT_INT
+define void @v_ctpop_i32_add_inline_constant_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
+ %val = load i32, i32 addrspace(1)* %in, align 4
+ %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
+ %add = add i32 4, %ctpop
+ store i32 %add, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_ctpop_i32_add_literal:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]],
+; GCN: v_mov_b32_e32 [[LIT:v[0-9]+]], 0x1869f
+; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]]
+; VI: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]]
+; GCN: buffer_store_dword [[RESULT]],
+; GCN: s_endpgm
+define void @v_ctpop_i32_add_literal(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
+ %val = load i32, i32 addrspace(1)* %in, align 4
+ %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
+ %add = add i32 %ctpop, 99999
+ store i32 %add, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_ctpop_i32_add_var:
+; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]],
+; GCN-DAG: s_load_dword [[VAR:s[0-9]+]],
+; GCN: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
+; GCN: buffer_store_dword [[RESULT]],
+; GCN: s_endpgm
+
+; EG: BCNT_INT
+define void @v_ctpop_i32_add_var(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind {
+ %val = load i32, i32 addrspace(1)* %in, align 4
+ %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
+ %add = add i32 %ctpop, %const
+ store i32 %add, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_ctpop_i32_add_var_inv:
+; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]],
+; GCN-DAG: s_load_dword [[VAR:s[0-9]+]],
+; GCN: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
+; GCN: buffer_store_dword [[RESULT]],
+; GCN: s_endpgm
+
+; EG: BCNT_INT
+define void @v_ctpop_i32_add_var_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind {
+ %val = load i32, i32 addrspace(1)* %in, align 4
+ %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
+ %add = add i32 %const, %ctpop
+ store i32 %add, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_ctpop_i32_add_vvar_inv:
+; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], s[{{[0-9]+:[0-9]+}}], {{0$}}
+; GCN-DAG: buffer_load_dword [[VAR:v[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0 offset:16
+; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
+; VI: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
+; GCN: buffer_store_dword [[RESULT]],
+; GCN: s_endpgm
+
+; EG: BCNT_INT
+define void @v_ctpop_i32_add_vvar_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 addrspace(1)* noalias %constptr) nounwind {
+ %val = load i32, i32 addrspace(1)* %in, align 4
+ %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
+ %gep = getelementptr i32, i32 addrspace(1)* %constptr, i32 4
+ %const = load i32, i32 addrspace(1)* %gep, align 4
+ %add = add i32 %const, %ctpop
+ store i32 %add, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FIXME: We currently disallow SALU instructions in all branches,
+; but there are some cases when they should be allowed.
+
+; FUNC-LABEL: {{^}}ctpop_i32_in_br:
+; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xd
+; VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x34
+; GCN: s_bcnt1_i32_b32 [[SRESULT:s[0-9]+]], [[VAL]]
+; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], [[SRESULT]]
+; GCN: buffer_store_dword [[RESULT]],
+; GCN: s_endpgm
+; EG: BCNT_INT
+define void @ctpop_i32_in_br(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %ctpop_arg, i32 %cond) {
+entry:
+ %tmp0 = icmp eq i32 %cond, 0
+ br i1 %tmp0, label %if, label %else
+
+if:
+ %tmp2 = call i32 @llvm.ctpop.i32(i32 %ctpop_arg)
+ br label %endif
+
+else:
+ %tmp3 = getelementptr i32, i32 addrspace(1)* %in, i32 1
+ %tmp4 = load i32, i32 addrspace(1)* %tmp3
+ br label %endif
+
+endif:
+ %tmp5 = phi i32 [%tmp2, %if], [%tmp4, %else]
+ store i32 %tmp5, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/ctpop64.ll b/test/CodeGen/AMDGPU/ctpop64.ll
new file mode 100644
index 000000000000..e1a0ee3ea217
--- /dev/null
+++ b/test/CodeGen/AMDGPU/ctpop64.ll
@@ -0,0 +1,124 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
+
+declare i64 @llvm.ctpop.i64(i64) nounwind readnone
+declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>) nounwind readnone
+declare <4 x i64> @llvm.ctpop.v4i64(<4 x i64>) nounwind readnone
+declare <8 x i64> @llvm.ctpop.v8i64(<8 x i64>) nounwind readnone
+declare <16 x i64> @llvm.ctpop.v16i64(<16 x i64>) nounwind readnone
+
+; FUNC-LABEL: {{^}}s_ctpop_i64:
+; SI: s_load_dwordx2 [[SVAL:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; VI: s_load_dwordx2 [[SVAL:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; GCN: s_bcnt1_i32_b64 [[SRESULT:s[0-9]+]], [[SVAL]]
+; GCN: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
+; GCN: buffer_store_dword [[VRESULT]],
+; GCN: s_endpgm
+define void @s_ctpop_i64(i32 addrspace(1)* noalias %out, i64 %val) nounwind {
+ %ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone
+ %truncctpop = trunc i64 %ctpop to i32
+ store i32 %truncctpop, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
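+; There is no 64-bit VALU population count, so the expected lowering chains
+; two 32-bit counts: the low dword is counted with a zero accumulator and the
+; high dword's count is added on top of that intermediate result.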
+; FUNC-LABEL: {{^}}v_ctpop_i64:
+; GCN: buffer_load_dwordx2 v{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}},
+; GCN: v_bcnt_u32_b32_e64 [[MIDRESULT:v[0-9]+]], v[[LOVAL]], 0
+; SI-NEXT: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]]
+; VI-NEXT: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]]
+; GCN: buffer_store_dword [[RESULT]],
+; GCN: s_endpgm
+define void @v_ctpop_i64(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
+ %val = load i64, i64 addrspace(1)* %in, align 8
+ %ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone
+ %truncctpop = trunc i64 %ctpop to i32
+ store i32 %truncctpop, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_ctpop_v2i64:
+; GCN: s_bcnt1_i32_b64
+; GCN: s_bcnt1_i32_b64
+; GCN: s_endpgm
+define void @s_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> %val) nounwind {
+ %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %val) nounwind readnone
+ %truncctpop = trunc <2 x i64> %ctpop to <2 x i32>
+ store <2 x i32> %truncctpop, <2 x i32> addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_ctpop_v4i64:
+; GCN: s_bcnt1_i32_b64
+; GCN: s_bcnt1_i32_b64
+; GCN: s_bcnt1_i32_b64
+; GCN: s_bcnt1_i32_b64
+; GCN: s_endpgm
+define void @s_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> %val) nounwind {
+ %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %val) nounwind readnone
+ %truncctpop = trunc <4 x i64> %ctpop to <4 x i32>
+ store <4 x i32> %truncctpop, <4 x i32> addrspace(1)* %out, align 16
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_ctpop_v2i64:
+; GCN: v_bcnt_u32_b32
+; GCN: v_bcnt_u32_b32
+; GCN: v_bcnt_u32_b32
+; GCN: v_bcnt_u32_b32
+; GCN: s_endpgm
+define void @v_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in) nounwind {
+ %val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16
+ %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %val) nounwind readnone
+ %truncctpop = trunc <2 x i64> %ctpop to <2 x i32>
+ store <2 x i32> %truncctpop, <2 x i32> addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_ctpop_v4i64:
+; GCN: v_bcnt_u32_b32
+; GCN: v_bcnt_u32_b32
+; GCN: v_bcnt_u32_b32
+; GCN: v_bcnt_u32_b32
+; GCN: v_bcnt_u32_b32
+; GCN: v_bcnt_u32_b32
+; GCN: v_bcnt_u32_b32
+; GCN: v_bcnt_u32_b32
+; GCN: s_endpgm
+define void @v_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> addrspace(1)* noalias %in) nounwind {
+ %val = load <4 x i64>, <4 x i64> addrspace(1)* %in, align 32
+ %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %val) nounwind readnone
+ %truncctpop = trunc <4 x i64> %ctpop to <4 x i32>
+ store <4 x i32> %truncctpop, <4 x i32> addrspace(1)* %out, align 16
+ ret void
+}
+
+; FIXME: We currently disallow SALU instructions in all branches,
+; but there are some cases when they should be allowed.
+
+; FUNC-LABEL: {{^}}ctpop_i64_in_br:
+; SI: s_load_dwordx2 s{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0xd
+; VI: s_load_dwordx2 s{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0x34
+; GCN: s_bcnt1_i32_b64 [[RESULT:s[0-9]+]], {{s\[}}[[LOVAL]]:[[HIVAL]]{{\]}}
+; GCN: v_mov_b32_e32 v[[VLO:[0-9]+]], [[RESULT]]
+; GCN: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[HIVAL]]
+; GCN: buffer_store_dwordx2 {{v\[}}[[VLO]]:[[VHI]]{{\]}}
+; GCN: s_endpgm
+define void @ctpop_i64_in_br(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %ctpop_arg, i32 %cond) {
+entry:
+ %tmp0 = icmp eq i32 %cond, 0
+ br i1 %tmp0, label %if, label %else
+
+if:
+ %tmp2 = call i64 @llvm.ctpop.i64(i64 %ctpop_arg)
+ br label %endif
+
+else:
+ %tmp3 = getelementptr i64, i64 addrspace(1)* %in, i32 1
+ %tmp4 = load i64, i64 addrspace(1)* %tmp3
+ br label %endif
+
+endif:
+ %tmp5 = phi i64 [%tmp2, %if], [%tmp4, %else]
+ store i64 %tmp5, i64 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/test/CodeGen/AMDGPU/cttz_zero_undef.ll
new file mode 100644
index 000000000000..56fcb51fe14e
--- /dev/null
+++ b/test/CodeGen/AMDGPU/cttz_zero_undef.ll
@@ -0,0 +1,71 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+declare i32 @llvm.cttz.i32(i32, i1) nounwind readnone
+declare <2 x i32> @llvm.cttz.v2i32(<2 x i32>, i1) nounwind readnone
+declare <4 x i32> @llvm.cttz.v4i32(<4 x i32>, i1) nounwind readnone
+
+; FUNC-LABEL: {{^}}s_cttz_zero_undef_i32:
+; SI: s_load_dword [[VAL:s[0-9]+]],
+; SI: s_ff1_i32_b32 [[SRESULT:s[0-9]+]], [[VAL]]
+; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
+; SI: buffer_store_dword [[VRESULT]],
+; SI: s_endpgm
+; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
+; EG: FFBL_INT {{\*? *}}[[RESULT]]
+define void @s_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
+ %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone
+ store i32 %cttz, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_cttz_zero_undef_i32:
+; SI: buffer_load_dword [[VAL:v[0-9]+]],
+; SI: v_ffbl_b32_e32 [[RESULT:v[0-9]+]], [[VAL]]
+; SI: buffer_store_dword [[RESULT]],
+; SI: s_endpgm
+; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
+; EG: FFBL_INT {{\*? *}}[[RESULT]]
+define void @v_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+ %val = load i32, i32 addrspace(1)* %valptr, align 4
+ %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone
+ store i32 %cttz, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_cttz_zero_undef_v2i32:
+; SI: buffer_load_dwordx2
+; SI: v_ffbl_b32_e32
+; SI: v_ffbl_b32_e32
+; SI: buffer_store_dwordx2
+; SI: s_endpgm
+; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}}
+; EG: FFBL_INT {{\*? *}}[[RESULT]]
+; EG: FFBL_INT {{\*? *}}[[RESULT]]
+define void @v_cttz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
+ %val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr, align 8
+ %cttz = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %val, i1 true) nounwind readnone
+ store <2 x i32> %cttz, <2 x i32> addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_cttz_zero_undef_v4i32:
+; SI: buffer_load_dwordx4
+; SI: v_ffbl_b32_e32
+; SI: v_ffbl_b32_e32
+; SI: v_ffbl_b32_e32
+; SI: v_ffbl_b32_e32
+; SI: buffer_store_dwordx4
+; SI: s_endpgm
+; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}}
+; EG: FFBL_INT {{\*? *}}[[RESULT]]
+; EG: FFBL_INT {{\*? *}}[[RESULT]]
+; EG: FFBL_INT {{\*? *}}[[RESULT]]
+; EG: FFBL_INT {{\*? *}}[[RESULT]]
+define void @v_cttz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
+ %val = load <4 x i32>, <4 x i32> addrspace(1)* %valptr, align 16
+ %cttz = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %val, i1 true) nounwind readnone
+ store <4 x i32> %cttz, <4 x i32> addrspace(1)* %out, align 16
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
new file mode 100644
index 000000000000..3399d9da29e3
--- /dev/null
+++ b/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -0,0 +1,196 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
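+; uitofp from i8 should select the v_cvt_f32_ubyteN instructions, which
+; convert byte N of a dword directly, so no bfe/lshr/and is needed to
+; extract the byte first.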
+; SI-LABEL: {{^}}load_i8_to_f32:
+; SI: buffer_load_ubyte [[LOADREG:v[0-9]+]],
+; SI-NOT: bfe
+; SI-NOT: lshr
+; SI: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[LOADREG]]
+; SI: buffer_store_dword [[CONV]],
+define void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind {
+ %load = load i8, i8 addrspace(1)* %in, align 1
+ %cvt = uitofp i8 %load to float
+ store float %cvt, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}load_v2i8_to_v2f32:
+; SI: buffer_load_ushort [[LOADREG:v[0-9]+]],
+; SI-NOT: bfe
+; SI-NOT: lshr
+; SI-NOT: and
+; SI-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[LOADREG]]
+; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG]]
+; SI: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
+define void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8> addrspace(1)* noalias %in) nounwind {
+ %load = load <2 x i8>, <2 x i8> addrspace(1)* %in, align 2
+ %cvt = uitofp <2 x i8> %load to <2 x float>
+ store <2 x float> %cvt, <2 x float> addrspace(1)* %out, align 16
+ ret void
+}
+
+; SI-LABEL: {{^}}load_v3i8_to_v3f32:
+; SI-NOT: bfe
+; SI-NOT: v_cvt_f32_ubyte3_e32
+; SI-DAG: v_cvt_f32_ubyte2_e32
+; SI-DAG: v_cvt_f32_ubyte1_e32
+; SI-DAG: v_cvt_f32_ubyte0_e32
+; SI: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
+define void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind {
+ %load = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 4
+ %cvt = uitofp <3 x i8> %load to <3 x float>
+ store <3 x float> %cvt, <3 x float> addrspace(1)* %out, align 16
+ ret void
+}
+
+; SI-LABEL: {{^}}load_v4i8_to_v4f32:
+; SI: buffer_load_dword [[LOADREG:v[0-9]+]]
+; SI-NOT: bfe
+; SI-NOT: lshr
+; SI-DAG: v_cvt_f32_ubyte3_e32 v[[HIRESULT:[0-9]+]], [[LOADREG]]
+; SI-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, [[LOADREG]]
+; SI-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, [[LOADREG]]
+; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG]]
+; SI: buffer_store_dwordx4 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
+define void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
+ %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
+ %cvt = uitofp <4 x i8> %load to <4 x float>
+ store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
+ ret void
+}
+
+; This should not add instructions to shift each component into the correct
+; position in the word.
+
+; SI-LABEL: {{^}}load_v4i8_to_v4f32_unaligned:
+; SI: buffer_load_ubyte [[LOADREG3:v[0-9]+]]
+; SI: buffer_load_ubyte [[LOADREG2:v[0-9]+]]
+; SI: buffer_load_ubyte [[LOADREG1:v[0-9]+]]
+; SI: buffer_load_ubyte [[LOADREG0:v[0-9]+]]
+; SI-NOT: v_lshlrev_b32
+; SI-NOT: v_or_b32
+
+; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG0]]
+; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, [[LOADREG1]]
+; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, [[LOADREG2]]
+; SI-DAG: v_cvt_f32_ubyte0_e32 v[[HIRESULT:[0-9]+]], [[LOADREG3]]
+
+; SI: buffer_store_dwordx4 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
+define void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
+ %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 1
+ %cvt = uitofp <4 x i8> %load to <4 x float>
+ store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
+ ret void
+}
+
+; XXX - This should really still be able to use the v_cvt_f32_ubyte0
+; for each component, but computeKnownBits doesn't handle vectors very
+; well.
+
+; SI-LABEL: {{^}}load_v4i8_to_v4f32_2_uses:
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: v_cvt_f32_ubyte0_e32
+; SI: v_cvt_f32_ubyte0_e32
+; SI: v_cvt_f32_ubyte0_e32
+; SI: v_cvt_f32_ubyte0_e32
+
+; XXX - replace with this when v4i8 loads aren't scalarized anymore.
+; XSI: buffer_load_dword
+; XSI: v_cvt_f32_u32_e32
+; XSI: v_cvt_f32_u32_e32
+; XSI: v_cvt_f32_u32_e32
+; XSI: v_cvt_f32_u32_e32
+; SI: s_endpgm
+define void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %out2, <4 x i8> addrspace(1)* noalias %in) nounwind {
+ %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
+ %cvt = uitofp <4 x i8> %load to <4 x float>
+ store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
+ %add = add <4 x i8> %load, <i8 9, i8 9, i8 9, i8 9> ; Second use of %load
+ store <4 x i8> %add, <4 x i8> addrspace(1)* %out2, align 4
+ ret void
+}
+
+; Make sure this doesn't crash.
+; SI-LABEL: {{^}}load_v7i8_to_v7f32:
+; SI: s_endpgm
+define void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8> addrspace(1)* noalias %in) nounwind {
+ %load = load <7 x i8>, <7 x i8> addrspace(1)* %in, align 1
+ %cvt = uitofp <7 x i8> %load to <7 x float>
+ store <7 x float> %cvt, <7 x float> addrspace(1)* %out, align 16
+ ret void
+}
+
+; SI-LABEL: {{^}}load_v8i8_to_v8f32:
+; SI: buffer_load_dwordx2 v{{\[}}[[LOLOAD:[0-9]+]]:[[HILOAD:[0-9]+]]{{\]}},
+; SI-NOT: bfe
+; SI-NOT: lshr
+; SI-DAG: v_cvt_f32_ubyte3_e32 v{{[0-9]+}}, v[[LOLOAD]]
+; SI-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, v[[LOLOAD]]
+; SI-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, v[[LOLOAD]]
+; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, v[[LOLOAD]]
+; SI-DAG: v_cvt_f32_ubyte3_e32 v{{[0-9]+}}, v[[HILOAD]]
+; SI-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, v[[HILOAD]]
+; SI-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, v[[HILOAD]]
+; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, v[[HILOAD]]
+; SI-NOT: bfe
+; SI-NOT: lshr
+; SI: buffer_store_dword
+; SI: buffer_store_dword
+; SI: buffer_store_dword
+; SI: buffer_store_dword
+; SI: buffer_store_dword
+; SI: buffer_store_dword
+; SI: buffer_store_dword
+; SI: buffer_store_dword
+define void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8> addrspace(1)* noalias %in) nounwind {
+ %load = load <8 x i8>, <8 x i8> addrspace(1)* %in, align 8
+ %cvt = uitofp <8 x i8> %load to <8 x float>
+ store <8 x float> %cvt, <8 x float> addrspace(1)* %out, align 16
+ ret void
+}
+
+; SI-LABEL: {{^}}i8_zext_inreg_i32_to_f32:
+; SI: buffer_load_dword [[LOADREG:v[0-9]+]],
+; SI: v_add_i32_e32 [[ADD:v[0-9]+]], 2, [[LOADREG]]
+; SI-NEXT: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[ADD]]
+; SI: buffer_store_dword [[CONV]],
+define void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
+ %load = load i32, i32 addrspace(1)* %in, align 4
+ %add = add i32 %load, 2
+ %inreg = and i32 %add, 255
+ %cvt = uitofp i32 %inreg to float
+ store float %cvt, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}i8_zext_inreg_hi1_to_f32:
+define void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
+ %load = load i32, i32 addrspace(1)* %in, align 4
+ %inreg = and i32 %load, 65280
+ %shr = lshr i32 %inreg, 8
+ %cvt = uitofp i32 %shr to float
+ store float %cvt, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; We don't catch these cases because of the zext, but instcombine removes
+; them, so it shouldn't really matter.
+define void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind {
+ %load = load i8, i8 addrspace(1)* %in, align 1
+ %ext = zext i8 %load to i32
+ %cvt = uitofp i32 %ext to float
+ store float %cvt, float addrspace(1)* %out, align 4
+ ret void
+}
+
+define void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
+ %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 1
+ %ext = zext <4 x i8> %load to <4 x i32>
+ %cvt = uitofp <4 x i32> %ext to <4 x float>
+ store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/cvt_flr_i32_f32.ll b/test/CodeGen/AMDGPU/cvt_flr_i32_f32.ll
new file mode 100644
index 000000000000..2dd3a9f2a776
--- /dev/null
+++ b/test/CodeGen/AMDGPU/cvt_flr_i32_f32.ll
@@ -0,0 +1,86 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -enable-no-nans-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NONAN -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+declare float @llvm.fabs.f32(float) #1
+declare float @llvm.floor.f32(float) #1
+
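+; floor(x) followed by fptosi only folds into v_cvt_flr_i32_f32 when NaNs
+; can be ignored, so the SI-NONAN run (-enable-no-nans-fp-math) expects the
+; combined instruction while the SI-SAFE run expects it to be absent.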
+; FUNC-LABEL: {{^}}cvt_flr_i32_f32_0:
+; SI-SAFE-NOT: v_cvt_flr_i32_f32
+; SI-NOT: add
+; SI-NONAN: v_cvt_flr_i32_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}
+; SI: s_endpgm
+define void @cvt_flr_i32_f32_0(i32 addrspace(1)* %out, float %x) #0 {
+ %floor = call float @llvm.floor.f32(float %x) #1
+ %cvt = fptosi float %floor to i32
+ store i32 %cvt, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}cvt_flr_i32_f32_1:
+; SI: v_add_f32_e64 [[TMP:v[0-9]+]], 1.0, s{{[0-9]+}}
+; SI-SAFE-NOT: v_cvt_flr_i32_f32
+; SI-NONAN: v_cvt_flr_i32_f32_e32 v{{[0-9]+}}, [[TMP]]
+; SI: s_endpgm
+define void @cvt_flr_i32_f32_1(i32 addrspace(1)* %out, float %x) #0 {
+ %fadd = fadd float %x, 1.0
+ %floor = call float @llvm.floor.f32(float %fadd) #1
+ %cvt = fptosi float %floor to i32
+ store i32 %cvt, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}cvt_flr_i32_f32_fabs:
+; SI-NOT: add
+; SI-SAFE-NOT: v_cvt_flr_i32_f32
+; SI-NONAN: v_cvt_flr_i32_f32_e64 v{{[0-9]+}}, |s{{[0-9]+}}|
+; SI: s_endpgm
+define void @cvt_flr_i32_f32_fabs(i32 addrspace(1)* %out, float %x) #0 {
+ %x.fabs = call float @llvm.fabs.f32(float %x) #1
+ %floor = call float @llvm.floor.f32(float %x.fabs) #1
+ %cvt = fptosi float %floor to i32
+ store i32 %cvt, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}cvt_flr_i32_f32_fneg:
+; SI-NOT: add
+; SI-SAFE-NOT: v_cvt_flr_i32_f32
+; SI-NONAN: v_cvt_flr_i32_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}}
+; SI: s_endpgm
+define void @cvt_flr_i32_f32_fneg(i32 addrspace(1)* %out, float %x) #0 {
+ %x.fneg = fsub float -0.000000e+00, %x
+ %floor = call float @llvm.floor.f32(float %x.fneg) #1
+ %cvt = fptosi float %floor to i32
+ store i32 %cvt, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}cvt_flr_i32_f32_fabs_fneg:
+; SI-NOT: add
+; SI-SAFE-NOT: v_cvt_flr_i32_f32
+; SI-NONAN: v_cvt_flr_i32_f32_e64 v{{[0-9]+}}, -|s{{[0-9]+}}|
+; SI: s_endpgm
+define void @cvt_flr_i32_f32_fabs_fneg(i32 addrspace(1)* %out, float %x) #0 {
+ %x.fabs = call float @llvm.fabs.f32(float %x) #1
+ %x.fabs.fneg = fsub float -0.000000e+00, %x.fabs
+ %floor = call float @llvm.floor.f32(float %x.fabs.fneg) #1
+ %cvt = fptosi float %floor to i32
+ store i32 %cvt, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}no_cvt_flr_i32_f32_0:
+; SI-NOT: v_cvt_flr_i32_f32
+; SI: v_floor_f32
+; SI: v_cvt_u32_f32_e32
+; SI: s_endpgm
+define void @no_cvt_flr_i32_f32_0(i32 addrspace(1)* %out, float %x) #0 {
+ %floor = call float @llvm.floor.f32(float %x) #1
+ %cvt = fptoui float %floor to i32
+ store i32 %cvt, i32 addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/cvt_rpi_i32_f32.ll b/test/CodeGen/AMDGPU/cvt_rpi_i32_f32.ll
new file mode 100644
index 000000000000..864ac40260b3
--- /dev/null
+++ b/test/CodeGen/AMDGPU/cvt_rpi_i32_f32.ll
@@ -0,0 +1,83 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -enable-no-nans-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NONAN -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
+
+declare float @llvm.fabs.f32(float) #1
+declare float @llvm.floor.f32(float) #1
+
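+; Similarly to cvt_flr, floor(x + 0.5) followed by fptosi is only expected to
+; fold into v_cvt_rpi_i32_f32 under -enable-no-nans-fp-math (SI-NONAN); the
+; SI-SAFE run expects the separate add/floor/convert sequence.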
+; FUNC-LABEL: {{^}}cvt_rpi_i32_f32:
+; SI-SAFE-NOT: v_cvt_rpi_i32_f32
+; SI-NONAN: v_cvt_rpi_i32_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}
+; SI: s_endpgm
+define void @cvt_rpi_i32_f32(i32 addrspace(1)* %out, float %x) #0 {
+ %fadd = fadd float %x, 0.5
+ %floor = call float @llvm.floor.f32(float %fadd) #1
+ %cvt = fptosi float %floor to i32
+ store i32 %cvt, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}cvt_rpi_i32_f32_fabs:
+; SI-SAFE-NOT: v_cvt_rpi_i32_f32
+; SI-NONAN: v_cvt_rpi_i32_f32_e64 v{{[0-9]+}}, |s{{[0-9]+}}|{{$}}
+; SI: s_endpgm
+define void @cvt_rpi_i32_f32_fabs(i32 addrspace(1)* %out, float %x) #0 {
+ %x.fabs = call float @llvm.fabs.f32(float %x) #1
+ %fadd = fadd float %x.fabs, 0.5
+ %floor = call float @llvm.floor.f32(float %fadd) #1
+ %cvt = fptosi float %floor to i32
+ store i32 %cvt, i32 addrspace(1)* %out
+ ret void
+}
+
+; FIXME: This doesn't work because it forms fsub 0.5, x
+; FUNC-LABEL: {{^}}cvt_rpi_i32_f32_fneg:
+; XSI-NONAN: v_cvt_rpi_i32_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}}
+; SI: v_sub_f32_e64 [[TMP:v[0-9]+]], 0.5, s{{[0-9]+}}
+; SI-SAFE-NOT: v_cvt_flr_i32_f32
+; SI-NONAN: v_cvt_flr_i32_f32_e32 {{v[0-9]+}}, [[TMP]]
+; SI: s_endpgm
+define void @cvt_rpi_i32_f32_fneg(i32 addrspace(1)* %out, float %x) #0 {
+ %x.fneg = fsub float -0.000000e+00, %x
+ %fadd = fadd float %x.fneg, 0.5
+ %floor = call float @llvm.floor.f32(float %fadd) #1
+ %cvt = fptosi float %floor to i32
+ store i32 %cvt, i32 addrspace(1)* %out
+ ret void
+}
+
+; FIXME: This doesn't work for the same reason as above
+; FUNC-LABEL: {{^}}cvt_rpi_i32_f32_fabs_fneg:
+; SI-SAFE-NOT: v_cvt_rpi_i32_f32
+; XSI-NONAN: v_cvt_rpi_i32_f32_e64 v{{[0-9]+}}, -|s{{[0-9]+}}|
+
+; SI: v_sub_f32_e64 [[TMP:v[0-9]+]], 0.5, |s{{[0-9]+}}|
+; SI-SAFE-NOT: v_cvt_flr_i32_f32
+; SI-NONAN: v_cvt_flr_i32_f32_e32 {{v[0-9]+}}, [[TMP]]
+; SI: s_endpgm
+define void @cvt_rpi_i32_f32_fabs_fneg(i32 addrspace(1)* %out, float %x) #0 {
+ %x.fabs = call float @llvm.fabs.f32(float %x) #1
+ %x.fabs.fneg = fsub float -0.000000e+00, %x.fabs
+ %fadd = fadd float %x.fabs.fneg, 0.5
+ %floor = call float @llvm.floor.f32(float %fadd) #1
+ %cvt = fptosi float %floor to i32
+ store i32 %cvt, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}no_cvt_rpi_i32_f32_0:
+; SI-NOT: v_cvt_rpi_i32_f32
+; SI: v_add_f32
+; SI: v_floor_f32
+; SI: v_cvt_u32_f32
+; SI: s_endpgm
+define void @no_cvt_rpi_i32_f32_0(i32 addrspace(1)* %out, float %x) #0 {
+ %fadd = fadd float %x, 0.5
+ %floor = call float @llvm.floor.f32(float %fadd) #1
+ %cvt = fptoui float %floor to i32
+ store i32 %cvt, i32 addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/dagcombiner-bug-illegal-vec4-int-to-fp.ll b/test/CodeGen/AMDGPU/dagcombiner-bug-illegal-vec4-int-to-fp.ll
new file mode 100644
index 000000000000..fb43ff4fbddd
--- /dev/null
+++ b/test/CodeGen/AMDGPU/dagcombiner-bug-illegal-vec4-int-to-fp.ll
@@ -0,0 +1,36 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+; This test is for a bug in
+; DAGCombiner::reduceBuildVecConvertToConvertBuildVec() where
+; the wrong type was being passed to
+; TargetLowering::getOperationAction() when checking the legality of
+; ISD::UINT_TO_FP and ISD::SINT_TO_FP opcodes.
+
+
+; CHECK: {{^}}sint:
+; CHECK: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @sint(<4 x float> addrspace(1)* %out, i32 addrspace(1)* %in) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+ %sint = load i32, i32 addrspace(1) * %in
+ %conv = sitofp i32 %sint to float
+ %0 = insertelement <4 x float> undef, float %conv, i32 0
+ %splat = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> zeroinitializer
+ store <4 x float> %splat, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+;CHECK: {{^}}uint:
+;CHECK: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @uint(<4 x float> addrspace(1)* %out, i32 addrspace(1)* %in) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+ %uint = load i32, i32 addrspace(1) * %in
+ %conv = uitofp i32 %uint to float
+ %0 = insertelement <4 x float> undef, float %conv, i32 0
+ %splat = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> zeroinitializer
+ store <4 x float> %splat, <4 x float> addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/debug.ll b/test/CodeGen/AMDGPU/debug.ll
new file mode 100644
index 000000000000..a2e0e878b740
--- /dev/null
+++ b/test/CodeGen/AMDGPU/debug.ll
@@ -0,0 +1,10 @@
+; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs -mattr=dumpcode -filetype=obj | FileCheck --check-prefix=SI --check-prefix=FUNC %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs -mattr=dumpcode -filetype=obj | FileCheck --check-prefix=SI --check-prefix=FUNC %s
+
+; Test for a crash in the custom assembly dump code.
+
+; SI: s_endpgm
+define void @test(i32 addrspace(1)* %out) {
+ store i32 0, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/default-fp-mode.ll b/test/CodeGen/AMDGPU/default-fp-mode.ll
new file mode 100644
index 000000000000..da8e91454b98
--- /dev/null
+++ b/test/CodeGen/AMDGPU/default-fp-mode.ll
@@ -0,0 +1,36 @@
+; RUN: llc -march=amdgcn -mcpu=SI -mattr=-fp32-denormals,+fp64-denormals < %s | FileCheck -check-prefix=FP64-DENORMAL -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -mattr=+fp32-denormals,-fp64-denormals < %s | FileCheck -check-prefix=FP32-DENORMAL -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -mattr=+fp32-denormals,+fp64-denormals < %s | FileCheck -check-prefix=BOTH-DENORMAL -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -mattr=-fp32-denormals,-fp64-denormals < %s | FileCheck -check-prefix=NO-DENORMAL -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=DEFAULT -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -mattr=-fp32-denormals < %s | FileCheck -check-prefix=DEFAULT -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -mattr=+fp64-denormals < %s | FileCheck -check-prefix=DEFAULT -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp32-denormals,+fp64-denormals < %s | FileCheck -check-prefix=FP64-DENORMAL -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+fp32-denormals,-fp64-denormals < %s | FileCheck -check-prefix=FP32-DENORMAL -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+fp32-denormals,+fp64-denormals < %s | FileCheck -check-prefix=BOTH-DENORMAL -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp32-denormals,-fp64-denormals < %s | FileCheck -check-prefix=NO-DENORMAL -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=DEFAULT -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp32-denormals < %s | FileCheck -check-prefix=DEFAULT -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+fp64-denormals < %s | FileCheck -check-prefix=DEFAULT -check-prefix=FUNC %s
+
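+; The FloatMode values checked below correspond to the FP_DENORM bits of the
+; mode register: 48 (0x30) enables fp32 denormals, 192 (0xC0) enables
+; fp64/fp16 denormals, 240 (0xF0) enables both, and 0 flushes all, so the
+; default configuration keeps fp64 denormals enabled.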
+; FUNC-LABEL: {{^}}test_kernel:
+
+; DEFAULT: FloatMode: 192
+; DEFAULT: IeeeMode: 0
+
+; FP64-DENORMAL: FloatMode: 192
+; FP64-DENORMAL: IeeeMode: 0
+
+; FP32-DENORMAL: FloatMode: 48
+; FP32-DENORMAL: IeeeMode: 0
+
+; BOTH-DENORMAL: FloatMode: 240
+; BOTH-DENORMAL: IeeeMode: 0
+
+; NO-DENORMAL: FloatMode: 0
+; NO-DENORMAL: IeeeMode: 0
+define void @test_kernel(float addrspace(1)* %out0, double addrspace(1)* %out1) nounwind {
+ store float 0.0, float addrspace(1)* %out0
+ store double 0.0, double addrspace(1)* %out1
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/disconnected-predset-break-bug.ll b/test/CodeGen/AMDGPU/disconnected-predset-break-bug.ll
new file mode 100644
index 000000000000..cdd2c0cd4f43
--- /dev/null
+++ b/test/CodeGen/AMDGPU/disconnected-predset-break-bug.ll
@@ -0,0 +1,29 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+; PRED_SET* instructions must be tied to any instruction that uses their
+; result. This tests that there are no instructions between the PRED_SET*
+; and the PREDICATE_BREAK in this loop.
+
+; CHECK: {{^}}loop_ge:
+; CHECK: LOOP_START_DX10
+; CHECK: ALU_PUSH_BEFORE
+; CHECK-NEXT: JUMP
+; CHECK-NEXT: LOOP_BREAK
+define void @loop_ge(i32 addrspace(1)* nocapture %out, i32 %iterations) nounwind {
+entry:
+ %cmp5 = icmp sgt i32 %iterations, 0
+ br i1 %cmp5, label %for.body, label %for.end
+
+for.body: ; preds = %for.body, %entry
+ %i.07.in = phi i32 [ %i.07, %for.body ], [ %iterations, %entry ]
+ %ai.06 = phi i32 [ %add, %for.body ], [ 0, %entry ]
+ %i.07 = add nsw i32 %i.07.in, -1
+ %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %ai.06
+ store i32 %i.07, i32 addrspace(1)* %arrayidx, align 4
+ %add = add nsw i32 %ai.06, 1
+ %exitcond = icmp eq i32 %add, %iterations
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/dot4-folding.ll b/test/CodeGen/AMDGPU/dot4-folding.ll
new file mode 100644
index 000000000000..4df7b63bf98e
--- /dev/null
+++ b/test/CodeGen/AMDGPU/dot4-folding.ll
@@ -0,0 +1,27 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+; Exactly one constant vector can be folded into dot4, which means exactly
+; 4 MOV instructions
+; CHECK: {{^}}main:
+; CHECK: MOV
+; CHECK: MOV
+; CHECK: MOV
+; CHECK: MOV
+; CHECK-NOT: MOV
+; CHECK-NOT: MOV
+; CHECK-NOT: MOV
+; CHECK-NOT: MOV
+
+define void @main(float addrspace(1)* %out) {
+main_body:
+ %0 = load <4 x float>, <4 x float> addrspace(8)* null
+ %1 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+ %2 = call float @llvm.AMDGPU.dp4(<4 x float> %0,<4 x float> %1)
+ %3 = insertelement <4 x float> undef, float %2, i32 0
+ call void @llvm.R600.store.swizzle(<4 x float> %3, i32 0, i32 0)
+ ret void
+}
+
+declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+attributes #1 = { readnone }
diff --git a/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll b/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll
new file mode 100644
index 000000000000..e7e13d6178c4
--- /dev/null
+++ b/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll
@@ -0,0 +1,69 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=SI --check-prefix=CHECK %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=CI --check-prefix=CHECK %s
+
+declare i32 @llvm.r600.read.tidig.x() #0
+declare void @llvm.AMDGPU.barrier.local() #1
+
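+; As the test name suggests, the base address in this loop can be negative,
+; which keeps SI from folding the constant offsets into the DS offset field;
+; each access therefore needs its own v_add_i32. CI can fold the offsets, as
+; the CI-DAG checks expect.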
+; Function Attrs: nounwind
+; CHECK-LABEL: {{^}}signed_ds_offset_addressing_loop:
+; CHECK: BB0_1:
+; CHECK: v_add_i32_e32 [[VADDR:v[0-9]+]],
+; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR]]
+; SI-DAG: v_add_i32_e32 [[VADDR4:v[0-9]+]], 4, [[VADDR]]
+; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR4]]
+; SI-DAG: v_add_i32_e32 [[VADDR0x80:v[0-9]+]], 0x80, [[VADDR]]
+; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR0x80]]
+; SI-DAG: v_add_i32_e32 [[VADDR0x84:v[0-9]+]], 0x84, [[VADDR]]
+; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR0x84]]
+; SI-DAG: v_add_i32_e32 [[VADDR0x100:v[0-9]+]], 0x100, [[VADDR]]
+; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR0x100]]
+
+; CI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[VADDR]] offset1:1
+; CI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[VADDR]] offset0:32 offset1:33
+; CI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR]] offset:256
+; CHECK: s_endpgm
+define void @signed_ds_offset_addressing_loop(float addrspace(1)* noalias nocapture %out, float addrspace(3)* noalias nocapture readonly %lptr, i32 %n) #2 {
+entry:
+ %x.i = tail call i32 @llvm.r600.read.tidig.x() #0
+ %mul = shl nsw i32 %x.i, 1
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %sum.03 = phi float [ 0.000000e+00, %entry ], [ %add13, %for.body ]
+ %offset.02 = phi i32 [ %mul, %entry ], [ %add14, %for.body ]
+ %k.01 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ tail call void @llvm.AMDGPU.barrier.local() #1
+ %arrayidx = getelementptr inbounds float, float addrspace(3)* %lptr, i32 %offset.02
+ %tmp = load float, float addrspace(3)* %arrayidx, align 4
+ %add1 = add nsw i32 %offset.02, 1
+ %arrayidx2 = getelementptr inbounds float, float addrspace(3)* %lptr, i32 %add1
+ %tmp1 = load float, float addrspace(3)* %arrayidx2, align 4
+ %add3 = add nsw i32 %offset.02, 32
+ %arrayidx4 = getelementptr inbounds float, float addrspace(3)* %lptr, i32 %add3
+ %tmp2 = load float, float addrspace(3)* %arrayidx4, align 4
+ %add5 = add nsw i32 %offset.02, 33
+ %arrayidx6 = getelementptr inbounds float, float addrspace(3)* %lptr, i32 %add5
+ %tmp3 = load float, float addrspace(3)* %arrayidx6, align 4
+ %add7 = add nsw i32 %offset.02, 64
+ %arrayidx8 = getelementptr inbounds float, float addrspace(3)* %lptr, i32 %add7
+ %tmp4 = load float, float addrspace(3)* %arrayidx8, align 4
+ %add9 = fadd float %tmp, %tmp1
+ %add10 = fadd float %add9, %tmp2
+ %add11 = fadd float %add10, %tmp3
+ %add12 = fadd float %add11, %tmp4
+ %add13 = fadd float %sum.03, %add12
+ %inc = add nsw i32 %k.01, 1
+ %add14 = add nsw i32 %offset.02, 97
+ %exitcond = icmp eq i32 %inc, 8
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ %tmp5 = sext i32 %x.i to i64
+ %arrayidx15 = getelementptr inbounds float, float addrspace(1)* %out, i64 %tmp5
+ store float %add13, float addrspace(1)* %arrayidx15, align 4
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { noduplicate nounwind }
+attributes #2 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/AMDGPU/ds_read2.ll b/test/CodeGen/AMDGPU/ds_read2.ll
new file mode 100644
index 000000000000..5929898f8bd8
--- /dev/null
+++ b/test/CodeGen/AMDGPU/ds_read2.ll
@@ -0,0 +1,515 @@
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -strict-whitespace -check-prefix=SI %s
+
+; FIXME: We don't get cases where the address was an SGPR because we
+; get a copy to the address register for each one.
+
+@lds = addrspace(3) global [512 x float] undef, align 4
+@lds.f64 = addrspace(3) global [512 x double] undef, align 8
+
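+; The load/store optimizer should merge two ds_read_b32 from the same base
+; into a single ds_read2_b32 when both offsets fit in the 8-bit (0-255 dword)
+; offset0/offset1 fields; the tests below probe the maximum offset, the
+; just-too-far case, and situations where merging would be unsafe.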
+; SI-LABEL: @simple_read2_f32
+; SI: ds_read2_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:8
+; SI: s_waitcnt lgkmcnt(0)
+; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]]
+; SI: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @simple_read2_f32(float addrspace(1)* %out) #0 {
+ %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
+ %val0 = load float, float addrspace(3)* %arrayidx0, align 4
+ %add.x = add nsw i32 %x.i, 8
+ %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
+ %val1 = load float, float addrspace(3)* %arrayidx1, align 4
+ %sum = fadd float %val0, %val1
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
+ store float %sum, float addrspace(1)* %out.gep, align 4
+ ret void
+}
+
+; SI-LABEL: @simple_read2_f32_max_offset
+; SI: ds_read2_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:255
+; SI: s_waitcnt lgkmcnt(0)
+; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]]
+; SI: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @simple_read2_f32_max_offset(float addrspace(1)* %out) #0 {
+ %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
+ %val0 = load float, float addrspace(3)* %arrayidx0, align 4
+ %add.x = add nsw i32 %x.i, 255
+ %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
+ %val1 = load float, float addrspace(3)* %arrayidx1, align 4
+ %sum = fadd float %val0, %val1
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
+ store float %sum, float addrspace(1)* %out.gep, align 4
+ ret void
+}
+
+; SI-LABEL: @simple_read2_f32_too_far
+; SI-NOT: ds_read2_b32
+; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}}
+; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:1028
+; SI: s_endpgm
+define void @simple_read2_f32_too_far(float addrspace(1)* %out) #0 {
+ %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
+ %val0 = load float, float addrspace(3)* %arrayidx0, align 4
+ %add.x = add nsw i32 %x.i, 257
+ %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
+ %val1 = load float, float addrspace(3)* %arrayidx1, align 4
+ %sum = fadd float %val0, %val1
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
+ store float %sum, float addrspace(1)* %out.gep, align 4
+ ret void
+}
+
+; SI-LABEL: @simple_read2_f32_x2
+; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR:v[0-9]+]] offset1:8
+; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27
+; SI: s_endpgm
+define void @simple_read2_f32_x2(float addrspace(1)* %out) #0 {
+ %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+ %idx.0 = add nsw i32 %tid.x, 0
+ %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
+ %val0 = load float, float addrspace(3)* %arrayidx0, align 4
+
+ %idx.1 = add nsw i32 %tid.x, 8
+ %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1
+ %val1 = load float, float addrspace(3)* %arrayidx1, align 4
+ %sum.0 = fadd float %val0, %val1
+
+ %idx.2 = add nsw i32 %tid.x, 11
+ %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2
+ %val2 = load float, float addrspace(3)* %arrayidx2, align 4
+
+ %idx.3 = add nsw i32 %tid.x, 27
+ %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3
+ %val3 = load float, float addrspace(3)* %arrayidx3, align 4
+ %sum.1 = fadd float %val2, %val3
+
+ %sum = fadd float %sum.0, %sum.1
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %idx.0
+ store float %sum, float addrspace(1)* %out.gep, align 4
+ ret void
+}
+
+; Make sure there is an instruction between the two sets of reads.
+; SI-LABEL: @simple_read2_f32_x2_barrier
+; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR:v[0-9]+]] offset1:8
+; SI: s_barrier
+; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27
+; SI: s_endpgm
+define void @simple_read2_f32_x2_barrier(float addrspace(1)* %out) #0 {
+ %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+ %idx.0 = add nsw i32 %tid.x, 0
+ %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
+ %val0 = load float, float addrspace(3)* %arrayidx0, align 4
+
+ %idx.1 = add nsw i32 %tid.x, 8
+ %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1
+ %val1 = load float, float addrspace(3)* %arrayidx1, align 4
+ %sum.0 = fadd float %val0, %val1
+
+ call void @llvm.AMDGPU.barrier.local() #2
+
+ %idx.2 = add nsw i32 %tid.x, 11
+ %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2
+ %val2 = load float, float addrspace(3)* %arrayidx2, align 4
+
+ %idx.3 = add nsw i32 %tid.x, 27
+ %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3
+ %val3 = load float, float addrspace(3)* %arrayidx3, align 4
+ %sum.1 = fadd float %val2, %val3
+
+ %sum = fadd float %sum.0, %sum.1
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %idx.0
+ store float %sum, float addrspace(1)* %out.gep, align 4
+ ret void
+}
+
+; For some reason adding something to the base address for the first
+; element results in only folding the inner pair.
+
+; SI-LABEL: @simple_read2_f32_x2_nonzero_base
+; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR:v[0-9]+]] offset0:2 offset1:8
+; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27
+; SI: s_endpgm
+define void @simple_read2_f32_x2_nonzero_base(float addrspace(1)* %out) #0 {
+ %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+ %idx.0 = add nsw i32 %tid.x, 2
+ %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
+ %val0 = load float, float addrspace(3)* %arrayidx0, align 4
+
+ %idx.1 = add nsw i32 %tid.x, 8
+ %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1
+ %val1 = load float, float addrspace(3)* %arrayidx1, align 4
+ %sum.0 = fadd float %val0, %val1
+
+ %idx.2 = add nsw i32 %tid.x, 11
+ %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2
+ %val2 = load float, float addrspace(3)* %arrayidx2, align 4
+
+ %idx.3 = add nsw i32 %tid.x, 27
+ %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3
+ %val3 = load float, float addrspace(3)* %arrayidx3, align 4
+ %sum.1 = fadd float %val2, %val3
+
+ %sum = fadd float %sum.0, %sum.1
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %idx.0
+ store float %sum, float addrspace(1)* %out.gep, align 4
+ ret void
+}
+
+; Be careful of vectors of pointers. We don't know if the 2 pointers
+; in the vectors are really the same base, so this is not safe to
+; merge.
+; Base pointers come from different subregister of same super
+; register. We can't safely merge this.
+
+; SI-LABEL: @read2_ptr_is_subreg_arg_f32
+; SI-NOT: ds_read2_b32
+; SI: ds_read_b32
+; SI: ds_read_b32
+; SI: s_endpgm
+define void @read2_ptr_is_subreg_arg_f32(float addrspace(1)* %out, <2 x float addrspace(3)*> %lds.ptr) #0 {
+ %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
+ %index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0
+ %gep = getelementptr inbounds float, <2 x float addrspace(3)*> %lds.ptr, <2 x i32> %index.1
+ %gep.0 = extractelement <2 x float addrspace(3)*> %gep, i32 0
+ %gep.1 = extractelement <2 x float addrspace(3)*> %gep, i32 1
+ %val0 = load float, float addrspace(3)* %gep.0, align 4
+ %val1 = load float, float addrspace(3)* %gep.1, align 4
+ %add.x = add nsw i32 %x.i, 8
+ %sum = fadd float %val0, %val1
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
+ store float %sum, float addrspace(1)* %out.gep, align 4
+ ret void
+}
+
+; Apply a constant scalar offset after the pointer vector extract. We
+; are rejecting merges that have the same, constant 0 offset, so make
+; sure we are really rejecting it because of the different
+; subregisters.
+
+; SI-LABEL: @read2_ptr_is_subreg_arg_offset_f32
+; SI-NOT: ds_read2_b32
+; SI: ds_read_b32
+; SI: ds_read_b32
+; SI: s_endpgm
+define void @read2_ptr_is_subreg_arg_offset_f32(float addrspace(1)* %out, <2 x float addrspace(3)*> %lds.ptr) #0 {
+ %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
+ %index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0
+ %gep = getelementptr inbounds float, <2 x float addrspace(3)*> %lds.ptr, <2 x i32> %index.1
+ %gep.0 = extractelement <2 x float addrspace(3)*> %gep, i32 0
+ %gep.1 = extractelement <2 x float addrspace(3)*> %gep, i32 1
+
+ ; Apply an additional offset after the vector that will be more obviously folded.
+ %gep.1.offset = getelementptr float, float addrspace(3)* %gep.1, i32 8
+
+ %val0 = load float, float addrspace(3)* %gep.0, align 4
+ %val1 = load float, float addrspace(3)* %gep.1.offset, align 4
+ %add.x = add nsw i32 %x.i, 8
+ %sum = fadd float %val0, %val1
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
+ store float %sum, float addrspace(1)* %out.gep, align 4
+ ret void
+}
+
+; We should be able to merge in this case, but probably not worth the effort.
+; SI-LABEL: @read2_ptr_is_subreg_f32
+; SI-NOT: ds_read2_b32
+; SI: ds_read_b32
+; SI: ds_read_b32
+; SI: s_endpgm
+define void @read2_ptr_is_subreg_f32(float addrspace(1)* %out) #0 {
+ %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %ptr.0 = insertelement <2 x [512 x float] addrspace(3)*> undef, [512 x float] addrspace(3)* @lds, i32 0
+ %ptr.1 = insertelement <2 x [512 x float] addrspace(3)*> %ptr.0, [512 x float] addrspace(3)* @lds, i32 1
+ %x.i.v.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
+ %x.i.v.1 = insertelement <2 x i32> %x.i.v.0, i32 %x.i, i32 1
+ %idx = add <2 x i32> %x.i.v.1, <i32 0, i32 8>
+ %gep = getelementptr inbounds [512 x float], <2 x [512 x float] addrspace(3)*> %ptr.1, <2 x i32> <i32 0, i32 0>, <2 x i32> %idx
+ %gep.0 = extractelement <2 x float addrspace(3)*> %gep, i32 0
+ %gep.1 = extractelement <2 x float addrspace(3)*> %gep, i32 1
+ %val0 = load float, float addrspace(3)* %gep.0, align 4
+ %val1 = load float, float addrspace(3)* %gep.1, align 4
+ %add.x = add nsw i32 %x.i, 8
+ %sum = fadd float %val0, %val1
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
+ store float %sum, float addrspace(1)* %out.gep, align 4
+ ret void
+}
+
+; SI-LABEL: @simple_read2_f32_volatile_0
+; SI-NOT: ds_read2_b32
+; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}}
+; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:32
+; SI: s_endpgm
+define void @simple_read2_f32_volatile_0(float addrspace(1)* %out) #0 {
+ %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
+ %val0 = load volatile float, float addrspace(3)* %arrayidx0, align 4
+ %add.x = add nsw i32 %x.i, 8
+ %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
+ %val1 = load float, float addrspace(3)* %arrayidx1, align 4
+ %sum = fadd float %val0, %val1
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
+ store float %sum, float addrspace(1)* %out.gep, align 4
+ ret void
+}
+
+; SI-LABEL: @simple_read2_f32_volatile_1
+; SI-NOT: ds_read2_b32
+; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}}
+; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:32
+; SI: s_endpgm
+define void @simple_read2_f32_volatile_1(float addrspace(1)* %out) #0 {
+ %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
+ %val0 = load float, float addrspace(3)* %arrayidx0, align 4
+ %add.x = add nsw i32 %x.i, 8
+ %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
+ %val1 = load volatile float, float addrspace(3)* %arrayidx1, align 4
+ %sum = fadd float %val0, %val1
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
+ store float %sum, float addrspace(1)* %out.gep, align 4
+ ret void
+}
+
+; Can't fold since not correctly aligned.
+; XXX: This isn't really testing anything useful now. I think CI
+; allows unaligned LDS accesses, which would be a problem here.
+; SI-LABEL: @unaligned_read2_f32
+; SI-NOT: ds_read2_b32
+; SI: s_endpgm
+define void @unaligned_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
+ %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %x.i
+ %val0 = load float, float addrspace(3)* %arrayidx0, align 1
+ %add.x = add nsw i32 %x.i, 8
+ %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x
+ %val1 = load float, float addrspace(3)* %arrayidx1, align 1
+ %sum = fadd float %val0, %val1
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
+ store float %sum, float addrspace(1)* %out.gep, align 4
+ ret void
+}
+
+; SI-LABEL: @misaligned_2_simple_read2_f32
+; SI-NOT: ds_read2_b32
+; SI: s_endpgm
+define void @misaligned_2_simple_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
+ %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %x.i
+ %val0 = load float, float addrspace(3)* %arrayidx0, align 2
+ %add.x = add nsw i32 %x.i, 8
+ %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x
+ %val1 = load float, float addrspace(3)* %arrayidx1, align 2
+ %sum = fadd float %val0, %val1
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
+ store float %sum, float addrspace(1)* %out.gep, align 4
+ ret void
+}
+
+; SI-LABEL: @simple_read2_f64
+; SI: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, {{v[0-9]+}}
+; SI: ds_read2_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, [[VPTR]] offset1:8
+; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}
+; SI: buffer_store_dwordx2 [[RESULT]]
+; SI: s_endpgm
+define void @simple_read2_f64(double addrspace(1)* %out) #0 {
+ %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
+ %val0 = load double, double addrspace(3)* %arrayidx0, align 8
+ %add.x = add nsw i32 %x.i, 8
+ %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x
+ %val1 = load double, double addrspace(3)* %arrayidx1, align 8
+ %sum = fadd double %val0, %val1
+ %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i
+ store double %sum, double addrspace(1)* %out.gep, align 8
+ ret void
+}
+
+; SI-LABEL: @simple_read2_f64_max_offset
+; SI: ds_read2_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:255
+; SI: s_endpgm
+define void @simple_read2_f64_max_offset(double addrspace(1)* %out) #0 {
+ %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
+ %val0 = load double, double addrspace(3)* %arrayidx0, align 8
+ %add.x = add nsw i32 %x.i, 255
+ %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x
+ %val1 = load double, double addrspace(3)* %arrayidx1, align 8
+ %sum = fadd double %val0, %val1
+ %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i
+ store double %sum, double addrspace(1)* %out.gep, align 8
+ ret void
+}
+
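+; 257 elements is past the 8-bit offset1 limit of 255, so this stays as two
+; separate ds_read_b64 loads (the second at byte offset 257 * 8 = 2056).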
+; SI-LABEL: @simple_read2_f64_too_far
+; SI-NOT: ds_read2_b64
+; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}
+; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:2056
+; SI: s_endpgm
+define void @simple_read2_f64_too_far(double addrspace(1)* %out) #0 {
+ %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
+ %val0 = load double, double addrspace(3)* %arrayidx0, align 8
+ %add.x = add nsw i32 %x.i, 257
+ %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x
+ %val1 = load double, double addrspace(3)* %arrayidx1, align 8
+ %sum = fadd double %val0, %val1
+ %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i
+ store double %sum, double addrspace(1)* %out.gep, align 8
+ ret void
+}
+
+; Alignment is only 4, so the f64 loads are split into ds_read2_b32 pairs.
+; SI-LABEL: @misaligned_read2_f64
+; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:1
+; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:14 offset1:15
+; SI: s_endpgm
+define void @misaligned_read2_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
+ %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i
+ %val0 = load double, double addrspace(3)* %arrayidx0, align 4
+ %add.x = add nsw i32 %x.i, 7
+ %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x
+ %val1 = load double, double addrspace(3)* %arrayidx1, align 4
+ %sum = fadd double %val0, %val1
+ %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i
+ store double %sum, double addrspace(1)* %out.gep, align 4
+ ret void
+}
+
+@foo = addrspace(3) global [4 x i32] undef, align 4
+
+; SI-LABEL: @load_constant_adjacent_offsets
+; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:1
+define void @load_constant_adjacent_offsets(i32 addrspace(1)* %out) {
+ %val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
+ %val1 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4
+ %sum = add i32 %val0, %val1
+ store i32 %sum, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: @load_constant_disjoint_offsets
+; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:2
+define void @load_constant_disjoint_offsets(i32 addrspace(1)* %out) {
+ %val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
+ %val1 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4
+ %sum = add i32 %val0, %val1
+ store i32 %sum, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+@bar = addrspace(3) global [4 x i64] undef, align 4
+
+; SI-LABEL: @load_misaligned64_constant_offsets
+; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:1
+; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset0:2 offset1:3
+define void @load_misaligned64_constant_offsets(i64 addrspace(1)* %out) {
+ %val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
+ %val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4
+ %sum = add i64 %val0, %val1
+ store i64 %sum, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+@bar.large = addrspace(3) global [4096 x i64] undef, align 4
+
+; SI-LABEL: @load_misaligned64_constant_large_offsets
+; SI-DAG: v_mov_b32_e32 [[BASE0:v[0-9]+]], 0x7ff8{{$}}
+; SI-DAG: v_mov_b32_e32 [[BASE1:v[0-9]+]], 0x4000
+; SI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE0]] offset1:1
+; SI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE1]] offset1:1
+; SI: s_endpgm
+define void @load_misaligned64_constant_large_offsets(i64 addrspace(1)* %out) {
+ %val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4
+ %val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 4095), align 4
+ %sum = add i64 %val0, %val1
+ store i64 %sum, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+@sgemm.lA = internal unnamed_addr addrspace(3) global [264 x float] undef, align 4
+@sgemm.lB = internal unnamed_addr addrspace(3) global [776 x float] undef, align 4
+
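+; Sequence of LDS reads taken from an sgemm inner loop. There are no pattern
+; checks here; this mainly verifies that a longer mix of read2 candidates from
+; two different LDS arrays compiles cleanly.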
+define void @sgemm_inner_loop_read2_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb) #0 {
+ %x.i = tail call i32 @llvm.r600.read.tgid.x() #1
+ %y.i = tail call i32 @llvm.r600.read.tidig.y() #1
+ %arrayidx44 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %x.i
+ %tmp16 = load float, float addrspace(3)* %arrayidx44, align 4
+ %add47 = add nsw i32 %x.i, 1
+ %arrayidx48 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add47
+ %tmp17 = load float, float addrspace(3)* %arrayidx48, align 4
+ %add51 = add nsw i32 %x.i, 16
+ %arrayidx52 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add51
+ %tmp18 = load float, float addrspace(3)* %arrayidx52, align 4
+ %add55 = add nsw i32 %x.i, 17
+ %arrayidx56 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add55
+ %tmp19 = load float, float addrspace(3)* %arrayidx56, align 4
+ %arrayidx60 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %y.i
+ %tmp20 = load float, float addrspace(3)* %arrayidx60, align 4
+ %add63 = add nsw i32 %y.i, 1
+ %arrayidx64 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add63
+ %tmp21 = load float, float addrspace(3)* %arrayidx64, align 4
+ %add67 = add nsw i32 %y.i, 32
+ %arrayidx68 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add67
+ %tmp22 = load float, float addrspace(3)* %arrayidx68, align 4
+ %add71 = add nsw i32 %y.i, 33
+ %arrayidx72 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add71
+ %tmp23 = load float, float addrspace(3)* %arrayidx72, align 4
+ %add75 = add nsw i32 %y.i, 64
+ %arrayidx76 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add75
+ %tmp24 = load float, float addrspace(3)* %arrayidx76, align 4
+ %add79 = add nsw i32 %y.i, 65
+ %arrayidx80 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add79
+ %tmp25 = load float, float addrspace(3)* %arrayidx80, align 4
+ %sum.0 = fadd float %tmp16, %tmp17
+ %sum.1 = fadd float %sum.0, %tmp18
+ %sum.2 = fadd float %sum.1, %tmp19
+ %sum.3 = fadd float %sum.2, %tmp20
+ %sum.4 = fadd float %sum.3, %tmp21
+ %sum.5 = fadd float %sum.4, %tmp22
+ %sum.6 = fadd float %sum.5, %tmp23
+ %sum.7 = fadd float %sum.6, %tmp24
+ %sum.8 = fadd float %sum.7, %tmp25
+ store float %sum.8, float addrspace(1)* %C, align 4
+ ret void
+}
+
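+; The next two functions load wider types from under-aligned LDS pointers;
+; there are no pattern checks, they only need to compile.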
+define void @misaligned_read2_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(3)* %in) #0 {
+ %load = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 4
+ store <2 x i32> %load, <2 x i32> addrspace(1)* %out, align 8
+ ret void
+}
+
+define void @misaligned_read2_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %in) #0 {
+ %load = load i64, i64 addrspace(3)* %in, align 4
+ store i64 %load, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tgid.x() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tgid.y() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tidig.x() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tidig.y() #1
+
+; Function Attrs: noduplicate nounwind
+declare void @llvm.AMDGPU.barrier.local() #2
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { noduplicate nounwind }
diff --git a/test/CodeGen/AMDGPU/ds_read2_offset_order.ll b/test/CodeGen/AMDGPU/ds_read2_offset_order.ll
new file mode 100644
index 000000000000..9ea9a5a2617b
--- /dev/null
+++ b/test/CodeGen/AMDGPU/ds_read2_offset_order.ll
@@ -0,0 +1,45 @@
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -strict-whitespace -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -strict-whitespace -check-prefix=SI %s
+
+; XFAIL: *
+
+@lds = addrspace(3) global [512 x float] undef, align 4
+
+; SI-LABEL: {{^}}offset_order:
+
+; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:56
+; SI: ds_read2st64_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:0 offset1:4
+; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:2 offset1:3
+; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:11 offset1:1
+
+define void @offset_order(float addrspace(1)* %out) {
+entry:
+ %ptr0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 0
+ %val0 = load float, float addrspace(3)* %ptr0
+
+ %ptr1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 256
+ %val1 = load float, float addrspace(3)* %ptr1
+ %add1 = fadd float %val0, %val1
+
+ %ptr2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 3
+ %val2 = load float, float addrspace(3)* %ptr2
+ %add2 = fadd float %add1, %val2
+
+ %ptr3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 2
+ %val3 = load float, float addrspace(3)* %ptr3
+ %add3 = fadd float %add2, %val3
+
+ %ptr4 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 12
+ %val4 = load float, float addrspace(3)* %ptr4
+ %add4 = fadd float %add3, %val4
+
+ %ptr5 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 14
+ %val5 = load float, float addrspace(3)* %ptr5
+ %add5 = fadd float %add4, %val5
+
+ %ptr6 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 11
+ %val6 = load float, float addrspace(3)* %ptr6
+ %add6 = fadd float %add5, %val6
+ store float %add6, float addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/ds_read2st64.ll b/test/CodeGen/AMDGPU/ds_read2st64.ll
new file mode 100644
index 000000000000..54b3b45636d6
--- /dev/null
+++ b/test/CodeGen/AMDGPU/ds_read2st64.ll
@@ -0,0 +1,272 @@
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=SI %s
+
+@lds = addrspace(3) global [512 x float] undef, align 4
+@lds.f64 = addrspace(3) global [512 x double] undef, align 8
+
+
+; SI-LABEL: @simple_read2st64_f32_0_1
+; SI: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1
+; SI: s_waitcnt lgkmcnt(0)
+; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]]
+; SI: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @simple_read2st64_f32_0_1(float addrspace(1)* %out) #0 {
+ %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
+ %val0 = load float, float addrspace(3)* %arrayidx0, align 4
+ %add.x = add nsw i32 %x.i, 64
+ %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
+ %val1 = load float, float addrspace(3)* %arrayidx1, align 4
+ %sum = fadd float %val0, %val1
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
+ store float %sum, float addrspace(1)* %out.gep, align 4
+ ret void
+}
+
+; SI-LABEL: @simple_read2st64_f32_1_2
+; SI: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:2
+; SI: s_waitcnt lgkmcnt(0)
+; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]]
+; SI: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @simple_read2st64_f32_1_2(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
+ %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %add.x.0 = add nsw i32 %x.i, 64
+ %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.0
+ %val0 = load float, float addrspace(3)* %arrayidx0, align 4
+ %add.x.1 = add nsw i32 %x.i, 128
+ %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.1
+ %val1 = load float, float addrspace(3)* %arrayidx1, align 4
+ %sum = fadd float %val0, %val1
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
+ store float %sum, float addrspace(1)* %out.gep, align 4
+ ret void
+}
+
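+; 16320 = 255 * 64, the largest element index reachable with offset1:255 when
+; the stride unit is 64 elements.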
+; SI-LABEL: @simple_read2st64_f32_max_offset
+; SI: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:255
+; SI: s_waitcnt lgkmcnt(0)
+; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]]
+; SI: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @simple_read2st64_f32_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
+ %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %add.x.0 = add nsw i32 %x.i, 64
+ %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.0
+ %val0 = load float, float addrspace(3)* %arrayidx0, align 4
+ %add.x.1 = add nsw i32 %x.i, 16320
+ %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.1
+ %val1 = load float, float addrspace(3)* %arrayidx1, align 4
+ %sum = fadd float %val0, %val1
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
+ store float %sum, float addrspace(1)* %out.gep, align 4
+ ret void
+}
+
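+; 16384 elements is 16384 * 4 = 0x10000 bytes, beyond the largest encodable
+; st64 offset (255 * 64 * 4 = 0xff00), so the base address is adjusted with a
+; separate add instead.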
+; SI-LABEL: @simple_read2st64_f32_over_max_offset
+; SI-NOT: ds_read2st64_b32
+; SI: v_add_i32_e32 [[BIGADD:v[0-9]+]], 0x10000, {{v[0-9]+}}
+; SI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:256
+; SI: ds_read_b32 {{v[0-9]+}}, [[BIGADD]]
+; SI: s_endpgm
+define void @simple_read2st64_f32_over_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
+ %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %add.x.0 = add nsw i32 %x.i, 64
+ %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.0
+ %val0 = load float, float addrspace(3)* %arrayidx0, align 4
+ %add.x.1 = add nsw i32 %x.i, 16384
+ %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.1
+ %val1 = load float, float addrspace(3)* %arrayidx1, align 4
+ %sum = fadd float %val0, %val1
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
+ store float %sum, float addrspace(1)* %out.gep, align 4
+ ret void
+}
+
+; SI-LABEL: @odd_invalid_read2st64_f32_0
+; SI-NOT: ds_read2st64_b32
+; SI: s_endpgm
+define void @odd_invalid_read2st64_f32_0(float addrspace(1)* %out) #0 {
+ %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
+ %val0 = load float, float addrspace(3)* %arrayidx0, align 4
+ %add.x = add nsw i32 %x.i, 63
+ %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
+ %val1 = load float, float addrspace(3)* %arrayidx1, align 4
+ %sum = fadd float %val0, %val1
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
+ store float %sum, float addrspace(1)* %out.gep, align 4
+ ret void
+}
+
+; SI-LABEL: @odd_invalid_read2st64_f32_1
+; SI-NOT: ds_read2st64_b32
+; SI: s_endpgm
+define void @odd_invalid_read2st64_f32_1(float addrspace(1)* %out) #0 {
+ %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %add.x.0 = add nsw i32 %x.i, 64
+ %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x.0
+ %val0 = load float, float addrspace(3)* %arrayidx0, align 4
+ %add.x.1 = add nsw i32 %x.i, 127
+ %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x.1
+ %val1 = load float, float addrspace(3)* %arrayidx1, align 4
+ %sum = fadd float %val0, %val1
+ %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
+ store float %sum, float addrspace(1)* %out.gep, align 4
+ ret void
+}
+
+; SI-LABEL: @simple_read2st64_f64_0_1
+; SI: ds_read2st64_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1
+; SI: s_waitcnt lgkmcnt(0)
+; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}
+; SI: buffer_store_dwordx2 [[RESULT]]
+; SI: s_endpgm
+define void @simple_read2st64_f64_0_1(double addrspace(1)* %out) #0 {
+ %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
+ %val0 = load double, double addrspace(3)* %arrayidx0, align 8
+ %add.x = add nsw i32 %x.i, 64
+ %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x
+ %val1 = load double, double addrspace(3)* %arrayidx1, align 8
+ %sum = fadd double %val0, %val1
+ %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i
+ store double %sum, double addrspace(1)* %out.gep, align 8
+ ret void
+}
+
+; SI-LABEL: @simple_read2st64_f64_1_2
+; SI: ds_read2st64_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:2
+; SI: s_waitcnt lgkmcnt(0)
+; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}
+; SI: buffer_store_dwordx2 [[RESULT]]
+; SI: s_endpgm
+define void @simple_read2st64_f64_1_2(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
+ %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %add.x.0 = add nsw i32 %x.i, 64
+ %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0
+ %val0 = load double, double addrspace(3)* %arrayidx0, align 8
+ %add.x.1 = add nsw i32 %x.i, 128
+ %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.1
+ %val1 = load double, double addrspace(3)* %arrayidx1, align 8
+ %sum = fadd double %val0, %val1
+ %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i
+ store double %sum, double addrspace(1)* %out.gep, align 8
+ ret void
+}
+
+; Alignment is only 4, so the f64 accesses are split into ds_read2_b32 pairs
+; instead of using read2st64_b64.
+
+; SI-LABEL: @misaligned_read2st64_f64
+; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:1
+; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:128 offset1:129
+; SI: s_endpgm
+define void @misaligned_read2st64_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
+ %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i
+ %val0 = load double, double addrspace(3)* %arrayidx0, align 4
+ %add.x = add nsw i32 %x.i, 64
+ %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x
+ %val1 = load double, double addrspace(3)* %arrayidx1, align 4
+ %sum = fadd double %val0, %val1
+ %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i
+ store double %sum, double addrspace(1)* %out.gep, align 4
+ ret void
+}
+
+; The maximum is not the usual 0xff because 0xff * 8 * 64 > 0xffff
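+; For b64 the stride unit is 64 elements * 8 bytes = 512 bytes, so offset1 is
+; capped at 127 here (127 * 512 = 65024 fits in 16 bits, while 255 * 512 = 130560 does not).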
+; SI-LABEL: @simple_read2st64_f64_max_offset
+; SI: ds_read2st64_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:4 offset1:127
+; SI: s_waitcnt lgkmcnt(0)
+; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}
+; SI: buffer_store_dwordx2 [[RESULT]]
+; SI: s_endpgm
+define void @simple_read2st64_f64_max_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
+ %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %add.x.0 = add nsw i32 %x.i, 256
+ %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0
+ %val0 = load double, double addrspace(3)* %arrayidx0, align 8
+ %add.x.1 = add nsw i32 %x.i, 8128
+ %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.1
+ %val1 = load double, double addrspace(3)* %arrayidx1, align 8
+ %sum = fadd double %val0, %val1
+ %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i
+ store double %sum, double addrspace(1)* %out.gep, align 8
+ ret void
+}
+
+; SI-LABEL: @simple_read2st64_f64_over_max_offset
+; SI-NOT: ds_read2st64_b64
+; SI: v_add_i32_e32 [[BIGADD:v[0-9]+]], 0x10000, {{v[0-9]+}}
+; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset:512
+; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, [[BIGADD]]
+; SI: s_endpgm
+define void @simple_read2st64_f64_over_max_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
+ %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %add.x.0 = add nsw i32 %x.i, 64
+ %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0
+ %val0 = load double, double addrspace(3)* %arrayidx0, align 8
+ %add.x.1 = add nsw i32 %x.i, 8192
+ %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.1
+ %val1 = load double, double addrspace(3)* %arrayidx1, align 8
+ %sum = fadd double %val0, %val1
+ %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i
+ store double %sum, double addrspace(1)* %out.gep, align 8
+ ret void
+}
+
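+; 8129 is not a multiple of 64, so the second access cannot be expressed as an
+; st64 offset.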
+; SI-LABEL: @invalid_read2st64_f64_odd_offset
+; SI-NOT: ds_read2st64_b64
+; SI: s_endpgm
+define void @invalid_read2st64_f64_odd_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
+ %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %add.x.0 = add nsw i32 %x.i, 64
+ %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0
+ %val0 = load double, double addrspace(3)* %arrayidx0, align 8
+ %add.x.1 = add nsw i32 %x.i, 8129
+ %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.1
+ %val1 = load double, double addrspace(3)* %arrayidx1, align 8
+ %sum = fadd double %val0, %val1
+ %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i
+ store double %sum, double addrspace(1)* %out.gep, align 8
+ ret void
+}
+
+; The stride of 8 elements is 8 * 8 bytes. We need to make sure the
+; stride in elements, not bytes, is a multiple of 64.
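+; 8 * 8 = 64 bytes is divisible by 64, but the 8-element stride is not, so a
+; plain ds_read2_b64 with offset1:8 is used rather than read2st64.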
+
+; SI-LABEL: @byte_size_only_divisible_64_read2_f64
+; SI-NOT: ds_read2st64_b64
+; SI: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:8
+; SI: s_endpgm
+define void @byte_size_only_divisible_64_read2_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
+ %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i
+ %val0 = load double, double addrspace(3)* %arrayidx0, align 8
+ %add.x = add nsw i32 %x.i, 8
+ %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x
+ %val1 = load double, double addrspace(3)* %arrayidx1, align 8
+ %sum = fadd double %val0, %val1
+ %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i
+ store double %sum, double addrspace(1)* %out.gep, align 4
+ ret void
+}
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tgid.x() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tgid.y() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tidig.x() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tidig.y() #1
+
+; Function Attrs: noduplicate nounwind
+declare void @llvm.AMDGPU.barrier.local() #2
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { noduplicate nounwind }
diff --git a/test/CodeGen/AMDGPU/ds_write2.ll b/test/CodeGen/AMDGPU/ds_write2.ll
new file mode 100644
index 000000000000..b553d3459e40
--- /dev/null
+++ b/test/CodeGen/AMDGPU/ds_write2.ll
@@ -0,0 +1,425 @@
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -strict-whitespace -check-prefix=SI %s
+
+@lds = addrspace(3) global [512 x float] undef, align 4
+@lds.f64 = addrspace(3) global [512 x double] undef, align 8
+
+
+; SI-LABEL: @simple_write2_one_val_f32
+; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]]
+; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
+; SI: ds_write2_b32 [[VPTR]], [[VAL]], [[VAL]] offset1:8
+; SI: s_endpgm
+define void @simple_write2_one_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
+ %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %in.gep = getelementptr float, float addrspace(1)* %in, i32 %x.i
+ %val = load float, float addrspace(1)* %in.gep, align 4
+ %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
+ store float %val, float addrspace(3)* %arrayidx0, align 4
+ %add.x = add nsw i32 %x.i, 8
+ %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
+ store float %val, float addrspace(3)* %arrayidx1, align 4
+ ret void
+}
+
+; SI-LABEL: @simple_write2_two_val_f32
+; SI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
+; SI: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:8
+; SI: s_endpgm
+define void @simple_write2_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
+ %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i
+ %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1
+ %val0 = load float, float addrspace(1)* %in.gep.0, align 4
+ %val1 = load float, float addrspace(1)* %in.gep.1, align 4
+ %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
+ store float %val0, float addrspace(3)* %arrayidx0, align 4
+ %add.x = add nsw i32 %x.i, 8
+ %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
+ store float %val1, float addrspace(3)* %arrayidx1, align 4
+ ret void
+}
+
+; SI-LABEL: @simple_write2_two_val_f32_volatile_0
+; SI-NOT: ds_write2_b32
+; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}}
+; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:32
+; SI: s_endpgm
+define void @simple_write2_two_val_f32_volatile_0(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
+ %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
+ %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i
+ %val0 = load float, float addrspace(1)* %in0.gep, align 4
+ %val1 = load float, float addrspace(1)* %in1.gep, align 4
+ %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
+ store volatile float %val0, float addrspace(3)* %arrayidx0, align 4
+ %add.x = add nsw i32 %x.i, 8
+ %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
+ store float %val1, float addrspace(3)* %arrayidx1, align 4
+ ret void
+}
+
+; SI-LABEL: @simple_write2_two_val_f32_volatile_1
+; SI-NOT: ds_write2_b32
+; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}}
+; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:32
+; SI: s_endpgm
+define void @simple_write2_two_val_f32_volatile_1(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
+ %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
+ %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i
+ %val0 = load float, float addrspace(1)* %in0.gep, align 4
+ %val1 = load float, float addrspace(1)* %in1.gep, align 4
+ %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
+ store float %val0, float addrspace(3)* %arrayidx0, align 4
+ %add.x = add nsw i32 %x.i, 8
+ %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
+ store volatile float %val1, float addrspace(3)* %arrayidx1, align 4
+ ret void
+}
+
+; 2 data subregisters from different super registers.
+; SI-LABEL: @simple_write2_two_val_subreg2_mixed_f32
+; SI: buffer_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:{{[0-9]+\]}}
+; SI: buffer_load_dwordx2 v{{\[[0-9]+}}:[[VAL1:[0-9]+]]{{\]}}
+; SI: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
+; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8
+; SI: s_endpgm
+define void @simple_write2_two_val_subreg2_mixed_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 {
+ %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %in.gep.0 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i
+ %in.gep.1 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in.gep.0, i32 1
+ %val0 = load <2 x float>, <2 x float> addrspace(1)* %in.gep.0, align 8
+ %val1 = load <2 x float>, <2 x float> addrspace(1)* %in.gep.1, align 8
+ %val0.0 = extractelement <2 x float> %val0, i32 0
+ %val1.1 = extractelement <2 x float> %val1, i32 1
+ %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
+ store float %val0.0, float addrspace(3)* %arrayidx0, align 4
+ %add.x = add nsw i32 %x.i, 8
+ %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
+ store float %val1.1, float addrspace(3)* %arrayidx1, align 4
+ ret void
+}
+
+; SI-LABEL: @simple_write2_two_val_subreg2_f32
+; SI-DAG: buffer_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}
+; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
+; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8
+; SI: s_endpgm
+define void @simple_write2_two_val_subreg2_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 {
+ %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %in.gep = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i
+ %val = load <2 x float>, <2 x float> addrspace(1)* %in.gep, align 8
+ %val0 = extractelement <2 x float> %val, i32 0
+ %val1 = extractelement <2 x float> %val, i32 1
+ %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
+ store float %val0, float addrspace(3)* %arrayidx0, align 4
+ %add.x = add nsw i32 %x.i, 8
+ %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
+ store float %val1, float addrspace(3)* %arrayidx1, align 4
+ ret void
+}
+
+; SI-LABEL: @simple_write2_two_val_subreg4_f32
+; SI-DAG: buffer_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}
+; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
+; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8
+; SI: s_endpgm
+define void @simple_write2_two_val_subreg4_f32(float addrspace(1)* %C, <4 x float> addrspace(1)* %in) #0 {
+ %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %in.gep = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 %x.i
+ %val = load <4 x float>, <4 x float> addrspace(1)* %in.gep, align 16
+ %val0 = extractelement <4 x float> %val, i32 0
+ %val1 = extractelement <4 x float> %val, i32 3
+ %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
+ store float %val0, float addrspace(3)* %arrayidx0, align 4
+ %add.x = add nsw i32 %x.i, 8
+ %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
+ store float %val1, float addrspace(3)* %arrayidx1, align 4
+ ret void
+}
+
+; SI-LABEL: @simple_write2_two_val_max_offset_f32
+; SI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
+; SI: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:255
+; SI: s_endpgm
+define void @simple_write2_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
+ %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i
+ %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1
+ %val0 = load float, float addrspace(1)* %in.gep.0, align 4
+ %val1 = load float, float addrspace(1)* %in.gep.1, align 4
+ %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
+ store float %val0, float addrspace(3)* %arrayidx0, align 4
+ %add.x = add nsw i32 %x.i, 255
+ %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
+ store float %val1, float addrspace(3)* %arrayidx1, align 4
+ ret void
+}
+
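+; 257 is past the 8-bit offset1 limit of 255, so two separate ds_write_b32
+; stores remain (the second at byte offset 257 * 4 = 1028).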
+; SI-LABEL: @simple_write2_two_val_too_far_f32
+; SI: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}}
+; SI: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:1028
+; SI: s_endpgm
+define void @simple_write2_two_val_too_far_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
+ %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
+ %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i
+ %val0 = load float, float addrspace(1)* %in0.gep, align 4
+ %val1 = load float, float addrspace(1)* %in1.gep, align 4
+ %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
+ store float %val0, float addrspace(3)* %arrayidx0, align 4
+ %add.x = add nsw i32 %x.i, 257
+ %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
+ store float %val1, float addrspace(3)* %arrayidx1, align 4
+ ret void
+}
+
+; SI-LABEL: @simple_write2_two_val_f32_x2
+; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset1:8
+; SI-NEXT: ds_write2_b32 [[BASEADDR]], [[VAL0]], [[VAL1]] offset0:11 offset1:27
+; SI: s_endpgm
+define void @simple_write2_two_val_f32_x2(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
+ %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+ %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %tid.x
+ %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %tid.x
+ %val0 = load float, float addrspace(1)* %in0.gep, align 4
+ %val1 = load float, float addrspace(1)* %in1.gep, align 4
+
+ %idx.0 = add nsw i32 %tid.x, 0
+ %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
+ store float %val0, float addrspace(3)* %arrayidx0, align 4
+
+ %idx.1 = add nsw i32 %tid.x, 8
+ %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1
+ store float %val1, float addrspace(3)* %arrayidx1, align 4
+
+ %idx.2 = add nsw i32 %tid.x, 11
+ %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2
+ store float %val0, float addrspace(3)* %arrayidx2, align 4
+
+ %idx.3 = add nsw i32 %tid.x, 27
+ %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3
+ store float %val1, float addrspace(3)* %arrayidx3, align 4
+
+ ret void
+}
+
+; SI-LABEL: @simple_write2_two_val_f32_x2_nonzero_base
+; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset0:3 offset1:8
+; SI-NEXT: ds_write2_b32 [[BASEADDR]], [[VAL0]], [[VAL1]] offset0:11 offset1:27
+; SI: s_endpgm
+define void @simple_write2_two_val_f32_x2_nonzero_base(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
+ %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+ %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %tid.x
+ %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %tid.x
+ %val0 = load float, float addrspace(1)* %in0.gep, align 4
+ %val1 = load float, float addrspace(1)* %in1.gep, align 4
+
+ %idx.0 = add nsw i32 %tid.x, 3
+ %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
+ store float %val0, float addrspace(3)* %arrayidx0, align 4
+
+ %idx.1 = add nsw i32 %tid.x, 8
+ %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1
+ store float %val1, float addrspace(3)* %arrayidx1, align 4
+
+ %idx.2 = add nsw i32 %tid.x, 11
+ %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2
+ store float %val0, float addrspace(3)* %arrayidx2, align 4
+
+ %idx.3 = add nsw i32 %tid.x, 27
+ %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3
+ store float %val1, float addrspace(3)* %arrayidx3, align 4
+
+ ret void
+}
+
+; SI-LABEL: @write2_ptr_subreg_arg_two_val_f32
+; SI-NOT: ds_write2_b32
+; SI: ds_write_b32
+; SI: ds_write_b32
+; SI: s_endpgm
+define void @write2_ptr_subreg_arg_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1, <2 x float addrspace(3)*> %lds.ptr) #0 {
+ %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
+ %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i
+ %val0 = load float, float addrspace(1)* %in0.gep, align 4
+ %val1 = load float, float addrspace(1)* %in1.gep, align 4
+
+ %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
+ %index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0
+ %gep = getelementptr inbounds float, <2 x float addrspace(3)*> %lds.ptr, <2 x i32> %index.1
+ %gep.0 = extractelement <2 x float addrspace(3)*> %gep, i32 0
+ %gep.1 = extractelement <2 x float addrspace(3)*> %gep, i32 1
+
+ ; Apply an additional offset after the vector that will be more obviously folded.
+ %gep.1.offset = getelementptr float, float addrspace(3)* %gep.1, i32 8
+ store float %val0, float addrspace(3)* %gep.0, align 4
+
+ %add.x = add nsw i32 %x.i, 8
+ store float %val1, float addrspace(3)* %gep.1.offset, align 4
+ ret void
+}
+
+; SI-LABEL: @simple_write2_one_val_f64
+; SI: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]],
+; SI: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}}
+; SI: ds_write2_b64 [[VPTR]], [[VAL]], [[VAL]] offset1:8
+; SI: s_endpgm
+define void @simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 {
+ %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i
+ %val = load double, double addrspace(1)* %in.gep, align 8
+ %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
+ store double %val, double addrspace(3)* %arrayidx0, align 8
+ %add.x = add nsw i32 %x.i, 8
+ %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x
+ store double %val, double addrspace(3)* %arrayidx1, align 8
+ ret void
+}
+
+; SI-LABEL: @misaligned_simple_write2_one_val_f64
+; SI-DAG: buffer_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}
+; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}}
+; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:1
+; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset0:14 offset1:15
+; SI: s_endpgm
+define void @misaligned_simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 {
+ %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i
+ %val = load double, double addrspace(1)* %in.gep, align 8
+ %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i
+ store double %val, double addrspace(3)* %arrayidx0, align 4
+ %add.x = add nsw i32 %x.i, 7
+ %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x
+ store double %val, double addrspace(3)* %arrayidx1, align 4
+ ret void
+}
+
+; SI-LABEL: @simple_write2_two_val_f64
+; SI-DAG: buffer_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
+; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}}
+; SI: ds_write2_b64 [[VPTR]], [[VAL0]], [[VAL1]] offset1:8
+; SI: s_endpgm
+define void @simple_write2_two_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 {
+ %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %in.gep.0 = getelementptr double, double addrspace(1)* %in, i32 %x.i
+ %in.gep.1 = getelementptr double, double addrspace(1)* %in.gep.0, i32 1
+ %val0 = load double, double addrspace(1)* %in.gep.0, align 8
+ %val1 = load double, double addrspace(1)* %in.gep.1, align 8
+ %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
+ store double %val0, double addrspace(3)* %arrayidx0, align 8
+ %add.x = add nsw i32 %x.i, 8
+ %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x
+ store double %val1, double addrspace(3)* %arrayidx1, align 8
+ ret void
+}
+
+@foo = addrspace(3) global [4 x i32] undef, align 4
+
+; SI-LABEL: @store_constant_adjacent_offsets
+; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; SI: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
+define void @store_constant_adjacent_offsets() {
+ store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
+ store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4
+ ret void
+}
+
+; SI-LABEL: @store_constant_disjoint_offsets
+; SI-DAG: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x7b{{$}}
+; SI-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; SI: ds_write2_b32 [[ZERO]], [[VAL]], [[VAL]] offset1:2
+define void @store_constant_disjoint_offsets() {
+ store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
+ store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4
+ ret void
+}
+
+@bar = addrspace(3) global [4 x i64] undef, align 4
+
+; SI-LABEL: @store_misaligned64_constant_offsets
+; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; SI: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
+; SI: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3
+define void @store_misaligned64_constant_offsets() {
+ store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
+ store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4
+ ret void
+}
+
+@bar.large = addrspace(3) global [4096 x i64] undef, align 4
+
+; SI-LABEL: @store_misaligned64_constant_large_offsets
+; SI-DAG: v_mov_b32_e32 [[BASE0:v[0-9]+]], 0x7ff8{{$}}
+; SI-DAG: v_mov_b32_e32 [[BASE1:v[0-9]+]], 0x4000{{$}}
+; SI-DAG: ds_write2_b32 [[BASE0]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
+; SI-DAG: ds_write2_b32 [[BASE1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
+; SI: s_endpgm
+define void @store_misaligned64_constant_large_offsets() {
+ store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4
+ store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 4095), align 4
+ ret void
+}
+
+@sgemm.lA = internal unnamed_addr addrspace(3) global [264 x float] undef, align 4
+@sgemm.lB = internal unnamed_addr addrspace(3) global [776 x float] undef, align 4
+
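+; Store counterpart of the sgemm read sequence; no pattern checks, it only
+; needs to compile with write2 merging enabled.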
+define void @write2_sgemm_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb, float addrspace(1)* %in) #0 {
+ %x.i = tail call i32 @llvm.r600.read.tgid.x() #1
+ %y.i = tail call i32 @llvm.r600.read.tidig.y() #1
+ %val = load float, float addrspace(1)* %in
+ %arrayidx44 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %x.i
+ store float %val, float addrspace(3)* %arrayidx44, align 4
+ %add47 = add nsw i32 %x.i, 1
+ %arrayidx48 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add47
+ store float %val, float addrspace(3)* %arrayidx48, align 4
+ %add51 = add nsw i32 %x.i, 16
+ %arrayidx52 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add51
+ store float %val, float addrspace(3)* %arrayidx52, align 4
+ %add55 = add nsw i32 %x.i, 17
+ %arrayidx56 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add55
+ store float %val, float addrspace(3)* %arrayidx56, align 4
+ %arrayidx60 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %y.i
+ store float %val, float addrspace(3)* %arrayidx60, align 4
+ %add63 = add nsw i32 %y.i, 1
+ %arrayidx64 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add63
+ store float %val, float addrspace(3)* %arrayidx64, align 4
+ %add67 = add nsw i32 %y.i, 32
+ %arrayidx68 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add67
+ store float %val, float addrspace(3)* %arrayidx68, align 4
+ %add71 = add nsw i32 %y.i, 33
+ %arrayidx72 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add71
+ store float %val, float addrspace(3)* %arrayidx72, align 4
+ %add75 = add nsw i32 %y.i, 64
+ %arrayidx76 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add75
+ store float %val, float addrspace(3)* %arrayidx76, align 4
+ %add79 = add nsw i32 %y.i, 65
+ %arrayidx80 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add79
+ store float %val, float addrspace(3)* %arrayidx80, align 4
+ ret void
+}
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tgid.x() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tgid.y() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tidig.x() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tidig.y() #1
+
+; Function Attrs: noduplicate nounwind
+declare void @llvm.AMDGPU.barrier.local() #2
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { noduplicate nounwind }
diff --git a/test/CodeGen/AMDGPU/ds_write2st64.ll b/test/CodeGen/AMDGPU/ds_write2st64.ll
new file mode 100644
index 000000000000..1d9d881c5c7e
--- /dev/null
+++ b/test/CodeGen/AMDGPU/ds_write2st64.ll
@@ -0,0 +1,119 @@
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=SI %s
+
+
+@lds = addrspace(3) global [512 x float] undef, align 4
+
+
+; SI-LABEL: @simple_write2st64_one_val_f32_0_1
+; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]]
+; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
+; SI: ds_write2st64_b32 [[VPTR]], [[VAL]], [[VAL]] offset1:1
+; SI: s_endpgm
+define void @simple_write2st64_one_val_f32_0_1(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
+ %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %in.gep = getelementptr float, float addrspace(1)* %in, i32 %x.i
+ %val = load float, float addrspace(1)* %in.gep, align 4
+ %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
+ store float %val, float addrspace(3)* %arrayidx0, align 4
+ %add.x = add nsw i32 %x.i, 64
+ %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
+ store float %val, float addrspace(3)* %arrayidx1, align 4
+ ret void
+}
+
+; SI-LABEL: @simple_write2st64_two_val_f32_2_5
+; SI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
+; SI: ds_write2st64_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset0:2 offset1:5
+; SI: s_endpgm
+define void @simple_write2st64_two_val_f32_2_5(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
+ %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i
+ %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1
+ %val0 = load float, float addrspace(1)* %in.gep.0, align 4
+ %val1 = load float, float addrspace(1)* %in.gep.1, align 4
+ %add.x.0 = add nsw i32 %x.i, 128
+ %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x.0
+ store float %val0, float addrspace(3)* %arrayidx0, align 4
+ %add.x.1 = add nsw i32 %x.i, 320
+ %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x.1
+ store float %val1, float addrspace(3)* %arrayidx1, align 4
+ ret void
+}
+
+; SI-LABEL: @simple_write2st64_two_val_max_offset_f32
+; SI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
+; SI: ds_write2st64_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:255
+; SI: s_endpgm
+define void @simple_write2st64_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in, float addrspace(3)* %lds) #0 {
+ %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i
+ %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1
+ %val0 = load float, float addrspace(1)* %in.gep.0, align 4
+ %val1 = load float, float addrspace(1)* %in.gep.1, align 4
+ %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %x.i
+ store float %val0, float addrspace(3)* %arrayidx0, align 4
+ %add.x = add nsw i32 %x.i, 16320
+ %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x
+ store float %val1, float addrspace(3)* %arrayidx1, align 4
+ ret void
+}
+
+; SI-LABEL: @simple_write2st64_two_val_max_offset_f64
+; SI-DAG: buffer_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
+; SI-DAG: v_add_i32_e32 [[VPTR:v[0-9]+]],
+; SI: ds_write2st64_b64 [[VPTR]], [[VAL0]], [[VAL1]] offset0:4 offset1:127
+; SI: s_endpgm
+define void @simple_write2st64_two_val_max_offset_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 {
+ %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %in.gep.0 = getelementptr double, double addrspace(1)* %in, i32 %x.i
+ %in.gep.1 = getelementptr double, double addrspace(1)* %in.gep.0, i32 1
+ %val0 = load double, double addrspace(1)* %in.gep.0, align 8
+ %val1 = load double, double addrspace(1)* %in.gep.1, align 8
+ %add.x.0 = add nsw i32 %x.i, 256
+ %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0
+ store double %val0, double addrspace(3)* %arrayidx0, align 8
+ %add.x.1 = add nsw i32 %x.i, 8128
+ %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.1
+ store double %val1, double addrspace(3)* %arrayidx1, align 8
+ ret void
+}
+
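+; As with the read case, the 8-element stride is 64 bytes but not 64 elements,
+; so ds_write2_b64 is used rather than write2st64.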
+; SI-LABEL: @byte_size_only_divisible_64_write2st64_f64
+; SI-NOT: ds_write2st64_b64
+; SI: ds_write2_b64 {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset1:8
+; SI: s_endpgm
+define void @byte_size_only_divisible_64_write2st64_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 {
+ %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i
+ %val = load double, double addrspace(1)* %in.gep, align 8
+ %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i
+ store double %val, double addrspace(3)* %arrayidx0, align 8
+ %add.x = add nsw i32 %x.i, 8
+ %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x
+ store double %val, double addrspace(3)* %arrayidx1, align 8
+ ret void
+}
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tgid.x() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tgid.y() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tidig.x() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tidig.y() #1
+
+; Function Attrs: noduplicate nounwind
+declare void @llvm.AMDGPU.barrier.local() #2
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { noduplicate nounwind }
diff --git a/test/CodeGen/AMDGPU/elf.ll b/test/CodeGen/AMDGPU/elf.ll
new file mode 100644
index 000000000000..d0fd06a34379
--- /dev/null
+++ b/test/CodeGen/AMDGPU/elf.ll
@@ -0,0 +1,34 @@
+; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs -filetype=obj | llvm-readobj -s -symbols - | FileCheck --check-prefix=ELF %s
+; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs -o - | FileCheck --check-prefix=CONFIG --check-prefix=TYPICAL %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs -filetype=obj | llvm-readobj -s -symbols - | FileCheck --check-prefix=ELF %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs -o - | FileCheck --check-prefix=CONFIG --check-prefix=TONGA %s
+; RUN: llc < %s -march=amdgcn -mcpu=carrizo -verify-machineinstrs -filetype=obj | llvm-readobj -s -symbols - | FileCheck --check-prefix=ELF %s
+; RUN: llc < %s -march=amdgcn -mcpu=carrizo -verify-machineinstrs -o - | FileCheck --check-prefix=CONFIG --check-prefix=TYPICAL %s
+
+; Test that we don't try to produce a COFF file on Windows
+; RUN: llc < %s -mtriple=amdgcn-pc-mingw -mcpu=SI -verify-machineinstrs -filetype=obj | llvm-readobj -s -symbols - | FileCheck --check-prefix=ELF %s
+
+; ELF: Format: ELF32
+; ELF: Name: .AMDGPU.config
+; ELF: Type: SHT_PROGBITS
+
+; ELF: Symbol {
+; ELF: Name: test
+; ELF: Binding: Global
+
+; CONFIG: .section .AMDGPU.config
+; CONFIG-NEXT: .long 45096
+; TYPICAL-NEXT: .long 0
+; TONGA-NEXT: .long 576
+; CONFIG: .align 256
+; CONFIG: test:
+define void @test(i32 %p) #0 {
+ %i = add i32 %p, 2
+ %r = bitcast i32 %i to float
+ call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %r, float %r, float %r, float %r)
+ ret void
+}
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+
+attributes #0 = { "ShaderType"="0" } ; Pixel Shader
diff --git a/test/CodeGen/AMDGPU/elf.r600.ll b/test/CodeGen/AMDGPU/elf.r600.ll
new file mode 100644
index 000000000000..51cd08500932
--- /dev/null
+++ b/test/CodeGen/AMDGPU/elf.r600.ll
@@ -0,0 +1,17 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood -filetype=obj | llvm-readobj -s - | FileCheck --check-prefix=ELF %s
+; RUN: llc < %s -march=r600 -mcpu=redwood -o - | FileCheck --check-prefix=CONFIG %s
+
+; ELF: Format: ELF32
+; ELF: Name: .AMDGPU.config
+
+; CONFIG: .section .AMDGPU.config
+; CONFIG-NEXT: .long 166100
+; CONFIG-NEXT: .long 2
+; CONFIG-NEXT: .long 165900
+; CONFIG-NEXT: .long 0
+define void @test(float addrspace(1)* %out, i32 %p) {
+ %i = add i32 %p, 2
+ %r = bitcast i32 %i to float
+ store float %r, float addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/empty-function.ll b/test/CodeGen/AMDGPU/empty-function.ll
new file mode 100644
index 000000000000..a060900811ea
--- /dev/null
+++ b/test/CodeGen/AMDGPU/empty-function.ll
@@ -0,0 +1,21 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+; Make sure we don't assert on empty functions
+
+; SI: .text
+; SI-LABEL: {{^}}empty_function_ret:
+; SI: s_endpgm
+; SI: codeLenInByte = 4
+define void @empty_function_ret() #0 {
+ ret void
+}
+
+; SI: .text
+; SI-LABEL: {{^}}empty_function_unreachable:
+; SI: codeLenInByte = 0
+define void @empty_function_unreachable() #0 {
+ unreachable
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/endcf-loop-header.ll b/test/CodeGen/AMDGPU/endcf-loop-header.ll
new file mode 100644
index 000000000000..267a323c5063
--- /dev/null
+++ b/test/CodeGen/AMDGPU/endcf-loop-header.ll
@@ -0,0 +1,34 @@
+; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s
+
+; This tests that the llvm.SI.end.cf intrinsic is not inserted into the
+; loop block. This intrinsic will be lowered to s_or_b64 by the code
+; generator.
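+; A rough sketch of the shape being checked (illustration only; the concrete
+; register pair is made up and nothing here is FileCheck'd):
+;   s_or_b64 exec, exec, s[2:3]   ; restore the lanes masked off at the branch
+; loop:
+;   ...                           ; no exec restore inside the loop body
+;   s_cbranch_execnz loop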
+
+; CHECK-LABEL: {{^}}test:
+
+; This was lowered from the llvm.SI.end.cf intrinsic:
+; CHECK: s_or_b64 exec, exec
+
+; CHECK: [[LOOP_LABEL:[0-9A-Za-z_]+]]: ; %loop{{$}}
+; CHECK-NOT: s_or_b64 exec, exec
+; CHECK: s_cbranch_execnz [[LOOP_LABEL]]
+define void @test(i32 addrspace(1)* %out, i32 %cond) {
+entry:
+ %tmp0 = icmp eq i32 %cond, 0
+ br i1 %tmp0, label %if, label %loop
+
+if:
+ store i32 0, i32 addrspace(1)* %out
+ br label %loop
+
+loop:
+ %tmp1 = phi i32 [0, %entry], [0, %if], [%inc, %loop]
+ %inc = add i32 %tmp1, %cond
+ %tmp2 = icmp ugt i32 %inc, 10
+ br i1 %tmp2, label %done, label %loop
+
+done:
+ %tmp3 = getelementptr i32, i32 addrspace(1)* %out, i64 1
+ store i32 %inc, i32 addrspace(1)* %tmp3
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/extload-private.ll b/test/CodeGen/AMDGPU/extload-private.ll
new file mode 100644
index 000000000000..294c3a9c6782
--- /dev/null
+++ b/test/CodeGen/AMDGPU/extload-private.ll
@@ -0,0 +1,46 @@
+; RUN: llc < %s -march=amdgcn -mcpu=SI -mattr=-promote-alloca -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}load_i8_sext_private:
+; SI: buffer_load_sbyte v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen
+define void @load_i8_sext_private(i32 addrspace(1)* %out) {
+entry:
+ %tmp0 = alloca i8
+ %tmp1 = load i8, i8* %tmp0
+ %tmp2 = sext i8 %tmp1 to i32
+ store i32 %tmp2, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}load_i8_zext_private:
+; SI: buffer_load_ubyte v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen
+define void @load_i8_zext_private(i32 addrspace(1)* %out) {
+entry:
+ %tmp0 = alloca i8
+ %tmp1 = load i8, i8* %tmp0
+ %tmp2 = zext i8 %tmp1 to i32
+ store i32 %tmp2, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}load_i16_sext_private:
+; SI: buffer_load_sshort v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen
+define void @load_i16_sext_private(i32 addrspace(1)* %out) {
+entry:
+ %tmp0 = alloca i16
+ %tmp1 = load i16, i16* %tmp0
+ %tmp2 = sext i16 %tmp1 to i32
+ store i32 %tmp2, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}load_i16_zext_private:
+; SI: buffer_load_ushort v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen
+define void @load_i16_zext_private(i32 addrspace(1)* %out) {
+entry:
+ %tmp0 = alloca i16
+ %tmp1 = load i16, i16* %tmp0
+ %tmp2 = zext i16 %tmp1 to i32
+ store i32 %tmp2, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/extload.ll b/test/CodeGen/AMDGPU/extload.ll
new file mode 100644
index 000000000000..662eb7a9716b
--- /dev/null
+++ b/test/CodeGen/AMDGPU/extload.ll
@@ -0,0 +1,53 @@
+; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}anyext_load_i8:
+; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+.[XYZW]]],
+; EG: VTX_READ_32 [[VAL]]
+
+define void @anyext_load_i8(i8 addrspace(1)* nocapture noalias %out, i8 addrspace(1)* nocapture noalias %src) nounwind {
+ %cast = bitcast i8 addrspace(1)* %src to i32 addrspace(1)*
+ %load = load i32, i32 addrspace(1)* %cast, align 1
+ %x = bitcast i32 %load to <4 x i8>
+ %castOut = bitcast i8 addrspace(1)* %out to <4 x i8> addrspace(1)*
+ store <4 x i8> %x, <4 x i8> addrspace(1)* %castOut, align 1
+ ret void
+}
+
+; FUNC-LABEL: {{^}}anyext_load_i16:
+; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+.[XYZW]]],
+; EG: VTX_READ_32 [[VAL]]
+
+define void @anyext_load_i16(i16 addrspace(1)* nocapture noalias %out, i16 addrspace(1)* nocapture noalias %src) nounwind {
+ %cast = bitcast i16 addrspace(1)* %src to i32 addrspace(1)*
+ %load = load i32, i32 addrspace(1)* %cast, align 1
+ %x = bitcast i32 %load to <2 x i16>
+ %castOut = bitcast i16 addrspace(1)* %out to <2 x i16> addrspace(1)*
+ store <2 x i16> %x, <2 x i16> addrspace(1)* %castOut, align 1
+ ret void
+}
+
+; FUNC-LABEL: {{^}}anyext_load_lds_i8:
+; EG: LDS_READ_RET {{.*}}, [[VAL:T[0-9]+.[XYZW]]]
+; EG: LDS_WRITE * [[VAL]]
+define void @anyext_load_lds_i8(i8 addrspace(3)* nocapture noalias %out, i8 addrspace(3)* nocapture noalias %src) nounwind {
+ %cast = bitcast i8 addrspace(3)* %src to i32 addrspace(3)*
+ %load = load i32, i32 addrspace(3)* %cast, align 1
+ %x = bitcast i32 %load to <4 x i8>
+ %castOut = bitcast i8 addrspace(3)* %out to <4 x i8> addrspace(3)*
+ store <4 x i8> %x, <4 x i8> addrspace(3)* %castOut, align 1
+ ret void
+}
+
+; FUNC-LABEL: {{^}}anyext_load_lds_i16:
+; EG: LDS_READ_RET {{.*}}, [[VAL:T[0-9]+.[XYZW]]]
+; EG: LDS_WRITE * [[VAL]]
+define void @anyext_load_lds_i16(i16 addrspace(3)* nocapture noalias %out, i16 addrspace(3)* nocapture noalias %src) nounwind {
+ %cast = bitcast i16 addrspace(3)* %src to i32 addrspace(3)*
+ %load = load i32, i32 addrspace(3)* %cast, align 1
+ %x = bitcast i32 %load to <2 x i16>
+ %castOut = bitcast i16 addrspace(3)* %out to <2 x i16> addrspace(3)*
+ store <2 x i16> %x, <2 x i16> addrspace(3)* %castOut, align 1
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/extract_vector_elt_i16.ll b/test/CodeGen/AMDGPU/extract_vector_elt_i16.ll
new file mode 100644
index 000000000000..c7572efc6f5b
--- /dev/null
+++ b/test/CodeGen/AMDGPU/extract_vector_elt_i16.ll
@@ -0,0 +1,30 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}extract_vector_elt_v2i16:
+; SI: buffer_load_ushort
+; SI: buffer_load_ushort
+; SI: buffer_store_short
+; SI: buffer_store_short
+define void @extract_vector_elt_v2i16(i16 addrspace(1)* %out, <2 x i16> %foo) nounwind {
+ %p0 = extractelement <2 x i16> %foo, i32 0
+ %p1 = extractelement <2 x i16> %foo, i32 1
+ %out1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
+ store i16 %p1, i16 addrspace(1)* %out, align 2
+ store i16 %p0, i16 addrspace(1)* %out1, align 2
+ ret void
+}
+
+; FUNC-LABEL: {{^}}extract_vector_elt_v4i16:
+; SI: buffer_load_ushort
+; SI: buffer_load_ushort
+; SI: buffer_store_short
+; SI: buffer_store_short
+define void @extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x i16> %foo) nounwind {
+ %p0 = extractelement <4 x i16> %foo, i32 0
+ %p1 = extractelement <4 x i16> %foo, i32 2
+ %out1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
+ store i16 %p1, i16 addrspace(1)* %out, align 2
+ store i16 %p0, i16 addrspace(1)* %out1, align 2
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/fabs.f64.ll b/test/CodeGen/AMDGPU/fabs.f64.ll
new file mode 100644
index 000000000000..3c6136c1a7bd
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fabs.f64.ll
@@ -0,0 +1,97 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+
+declare double @fabs(double) readnone
+declare double @llvm.fabs.f64(double) readnone
+declare <2 x double> @llvm.fabs.v2f64(<2 x double>) readnone
+declare <4 x double> @llvm.fabs.v4f64(<4 x double>) readnone
+
+; FUNC-LABEL: {{^}}v_fabs_f64:
+; SI: v_and_b32
+; SI: s_endpgm
+define void @v_fabs_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
+ %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tidext = sext i32 %tid to i64
+ %gep = getelementptr double, double addrspace(1)* %in, i64 %tidext
+ %val = load double, double addrspace(1)* %gep, align 8
+ %fabs = call double @llvm.fabs.f64(double %val)
+ store double %fabs, double addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fabs_f64:
+; SI: v_and_b32
+; SI-NOT: v_and_b32
+; SI: s_endpgm
+define void @fabs_f64(double addrspace(1)* %out, double %in) {
+ %fabs = call double @llvm.fabs.f64(double %in)
+ store double %fabs, double addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fabs_v2f64:
+; SI: v_and_b32
+; SI: v_and_b32
+; SI: s_endpgm
+define void @fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) {
+ %fabs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %in)
+ store <2 x double> %fabs, <2 x double> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fabs_v4f64:
+; SI: v_and_b32
+; SI: v_and_b32
+; SI: v_and_b32
+; SI: v_and_b32
+; SI: s_endpgm
+define void @fabs_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) {
+ %fabs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %in)
+ store <4 x double> %fabs, <4 x double> addrspace(1)* %out
+ ret void
+}
+
+; SI-LABEL: {{^}}fabs_fold_f64:
+; SI: s_load_dwordx2 [[ABS_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-NOT: and
+; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, |[[ABS_VALUE]]|, {{v\[[0-9]+:[0-9]+\]}}
+; SI: s_endpgm
+define void @fabs_fold_f64(double addrspace(1)* %out, double %in0, double %in1) {
+ %fabs = call double @llvm.fabs.f64(double %in0)
+ %fmul = fmul double %fabs, %in1
+ store double %fmul, double addrspace(1)* %out
+ ret void
+}
+
+; SI-LABEL: {{^}}fabs_fn_fold_f64:
+; SI: s_load_dwordx2 [[ABS_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-NOT: and
+; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, |[[ABS_VALUE]]|, {{v\[[0-9]+:[0-9]+\]}}
+; SI: s_endpgm
+define void @fabs_fn_fold_f64(double addrspace(1)* %out, double %in0, double %in1) {
+ %fabs = call double @fabs(double %in0)
+ %fmul = fmul double %fabs, %in1
+ store double %fmul, double addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fabs_free_f64:
+; SI: v_and_b32
+; SI: s_endpgm
+define void @fabs_free_f64(double addrspace(1)* %out, i64 %in) {
+ %bc= bitcast i64 %in to double
+ %fabs = call double @llvm.fabs.f64(double %bc)
+ store double %fabs, double addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fabs_fn_free_f64:
+; SI: v_and_b32
+; SI: s_endpgm
+define void @fabs_fn_free_f64(double addrspace(1)* %out, i64 %in) {
+ %bc= bitcast i64 %in to double
+ %fabs = call double @fabs(double %bc)
+ store double %fabs, double addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/fabs.ll b/test/CodeGen/AMDGPU/fabs.ll
new file mode 100644
index 000000000000..419a73d02669
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fabs.ll
@@ -0,0 +1,101 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
+
+
+; DAGCombiner will transform:
+; (fabs (f32 bitcast (i32 a))) => (f32 bitcast (and (i32 a), 0x7FFFFFFF))
+; unless isFabsFree returns true
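+; For example, a minimal sketch of the equivalent IR after the combine (the
+; value names are made up and nothing below is FileCheck'd):
+;   %bc   = bitcast i32 %in to float
+;   %fabs = call float @llvm.fabs.f32(float %bc)
+; becomes
+;   %mask = and i32 %in, 2147483647   ; 0x7FFFFFFF clears the f32 sign bit
+;   %fabs = bitcast i32 %mask to float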
+
+; FUNC-LABEL: {{^}}fabs_fn_free:
+; R600-NOT: AND
+; R600: |PV.{{[XYZW]}}|
+
+; GCN: v_and_b32
+
+define void @fabs_fn_free(float addrspace(1)* %out, i32 %in) {
+ %bc= bitcast i32 %in to float
+ %fabs = call float @fabs(float %bc)
+ store float %fabs, float addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fabs_free:
+; R600-NOT: AND
+; R600: |PV.{{[XYZW]}}|
+
+; GCN: v_and_b32
+
+define void @fabs_free(float addrspace(1)* %out, i32 %in) {
+ %bc= bitcast i32 %in to float
+ %fabs = call float @llvm.fabs.f32(float %bc)
+ store float %fabs, float addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fabs_f32:
+; R600: |{{(PV|T[0-9])\.[XYZW]}}|
+
+; GCN: v_and_b32
+define void @fabs_f32(float addrspace(1)* %out, float %in) {
+ %fabs = call float @llvm.fabs.f32(float %in)
+ store float %fabs, float addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fabs_v2f32:
+; R600: |{{(PV|T[0-9])\.[XYZW]}}|
+; R600: |{{(PV|T[0-9])\.[XYZW]}}|
+
+; GCN: v_and_b32
+; GCN: v_and_b32
+define void @fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
+ %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in)
+ store <2 x float> %fabs, <2 x float> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fabs_v4f32:
+; R600: |{{(PV|T[0-9])\.[XYZW]}}|
+; R600: |{{(PV|T[0-9])\.[XYZW]}}|
+; R600: |{{(PV|T[0-9])\.[XYZW]}}|
+; R600: |{{(PV|T[0-9])\.[XYZW]}}|
+
+; GCN: v_and_b32
+; GCN: v_and_b32
+; GCN: v_and_b32
+; GCN: v_and_b32
+define void @fabs_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
+ %fabs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in)
+ store <4 x float> %fabs, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}fabs_fn_fold:
+; SI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb
+; VI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
+; GCN-NOT: and
+; GCN: v_mul_f32_e64 v{{[0-9]+}}, |[[ABS_VALUE]]|, v{{[0-9]+}}
+define void @fabs_fn_fold(float addrspace(1)* %out, float %in0, float %in1) {
+ %fabs = call float @fabs(float %in0)
+ %fmul = fmul float %fabs, %in1
+ store float %fmul, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}fabs_fold:
+; SI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb
+; VI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
+; GCN-NOT: and
+; GCN: v_mul_f32_e64 v{{[0-9]+}}, |[[ABS_VALUE]]|, v{{[0-9]+}}
+define void @fabs_fold(float addrspace(1)* %out, float %in0, float %in1) {
+ %fabs = call float @llvm.fabs.f32(float %in0)
+ %fmul = fmul float %fabs, %in1
+ store float %fmul, float addrspace(1)* %out
+ ret void
+}
+
+declare float @fabs(float) readnone
+declare float @llvm.fabs.f32(float) readnone
+declare <2 x float> @llvm.fabs.v2f32(<2 x float>) readnone
+declare <4 x float> @llvm.fabs.v4f32(<4 x float>) readnone
diff --git a/test/CodeGen/AMDGPU/fadd.ll b/test/CodeGen/AMDGPU/fadd.ll
new file mode 100644
index 000000000000..5fac328c5981
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fadd.ll
@@ -0,0 +1,64 @@
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s -check-prefix=R600 -check-prefix=FUNC
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC
+
+; FUNC-LABEL: {{^}}fadd_f32:
+; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, KC0[2].W
+; SI: v_add_f32
+define void @fadd_f32(float addrspace(1)* %out, float %a, float %b) {
+ %add = fadd float %a, %b
+ store float %add, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fadd_v2f32:
+; R600-DAG: ADD {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[3].X, KC0[3].Z
+; R600-DAG: ADD {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].W, KC0[3].Y
+; SI: v_add_f32
+; SI: v_add_f32
+define void @fadd_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
+ %add = fadd <2 x float> %a, %b
+ store <2 x float> %add, <2 x float> addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fadd_v4f32:
+; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; SI: v_add_f32
+; SI: v_add_f32
+; SI: v_add_f32
+; SI: v_add_f32
+define void @fadd_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
+ %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
+ %a = load <4 x float>, <4 x float> addrspace(1)* %in, align 16
+ %b = load <4 x float>, <4 x float> addrspace(1)* %b_ptr, align 16
+ %result = fadd <4 x float> %a, %b
+ store <4 x float> %result, <4 x float> addrspace(1)* %out, align 16
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fadd_v8f32:
+; R600: ADD
+; R600: ADD
+; R600: ADD
+; R600: ADD
+; R600: ADD
+; R600: ADD
+; R600: ADD
+; R600: ADD
+; SI: v_add_f32
+; SI: v_add_f32
+; SI: v_add_f32
+; SI: v_add_f32
+; SI: v_add_f32
+; SI: v_add_f32
+; SI: v_add_f32
+; SI: v_add_f32
+define void @fadd_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) {
+ %add = fadd <8 x float> %a, %b
+ store <8 x float> %add, <8 x float> addrspace(1)* %out, align 32
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/fadd64.ll b/test/CodeGen/AMDGPU/fadd64.ll
new file mode 100644
index 000000000000..485c55870c47
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fadd64.ll
@@ -0,0 +1,14 @@
+; RUN: llc < %s -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
+
+; CHECK: {{^}}fadd_f64:
+; CHECK: v_add_f64 {{v[[0-9]+:[0-9]+]}}, {{v[[0-9]+:[0-9]+]}}, {{v[[0-9]+:[0-9]+]}}
+
+define void @fadd_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+ double addrspace(1)* %in2) {
+ %r0 = load double, double addrspace(1)* %in1
+ %r1 = load double, double addrspace(1)* %in2
+ %r2 = fadd double %r0, %r1
+ store double %r2, double addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/fceil.ll b/test/CodeGen/AMDGPU/fceil.ll
new file mode 100644
index 000000000000..f23e8919d733
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fceil.ll
@@ -0,0 +1,132 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+declare float @llvm.ceil.f32(float) nounwind readnone
+declare <2 x float> @llvm.ceil.v2f32(<2 x float>) nounwind readnone
+declare <3 x float> @llvm.ceil.v3f32(<3 x float>) nounwind readnone
+declare <4 x float> @llvm.ceil.v4f32(<4 x float>) nounwind readnone
+declare <8 x float> @llvm.ceil.v8f32(<8 x float>) nounwind readnone
+declare <16 x float> @llvm.ceil.v16f32(<16 x float>) nounwind readnone
+
+; FUNC-LABEL: {{^}}fceil_f32:
+; SI: v_ceil_f32_e32
+; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
+; EG: CEIL {{\*? *}}[[RESULT]]
+define void @fceil_f32(float addrspace(1)* %out, float %x) {
+ %y = call float @llvm.ceil.f32(float %x) nounwind readnone
+ store float %y, float addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fceil_v2f32:
+; SI: v_ceil_f32_e32
+; SI: v_ceil_f32_e32
+; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}}
+; EG: CEIL {{\*? *}}[[RESULT]]
+; EG: CEIL {{\*? *}}[[RESULT]]
+define void @fceil_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %x) {
+ %y = call <2 x float> @llvm.ceil.v2f32(<2 x float> %x) nounwind readnone
+ store <2 x float> %y, <2 x float> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fceil_v3f32:
+; FIXME-SI: v_ceil_f32_e32
+; FIXME-SI: v_ceil_f32_e32
+; FIXME-SI: v_ceil_f32_e32
+; FIXME-EG: v3 is treated as v2 and v1, hence 2 stores
+; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT1:T[0-9]+]]{{\.[XYZW]}}
+; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT2:T[0-9]+]]{{\.[XYZW]}}
+; EG-DAG: CEIL {{\*? *}}[[RESULT1]]
+; EG-DAG: CEIL {{\*? *}}[[RESULT2]]
+; EG-DAG: CEIL {{\*? *}}[[RESULT2]]
+define void @fceil_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %x) {
+ %y = call <3 x float> @llvm.ceil.v3f32(<3 x float> %x) nounwind readnone
+ store <3 x float> %y, <3 x float> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fceil_v4f32:
+; SI: v_ceil_f32_e32
+; SI: v_ceil_f32_e32
+; SI: v_ceil_f32_e32
+; SI: v_ceil_f32_e32
+; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}}
+; EG: CEIL {{\*? *}}[[RESULT]]
+; EG: CEIL {{\*? *}}[[RESULT]]
+; EG: CEIL {{\*? *}}[[RESULT]]
+; EG: CEIL {{\*? *}}[[RESULT]]
+define void @fceil_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %x) {
+ %y = call <4 x float> @llvm.ceil.v4f32(<4 x float> %x) nounwind readnone
+ store <4 x float> %y, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fceil_v8f32:
+; SI: v_ceil_f32_e32
+; SI: v_ceil_f32_e32
+; SI: v_ceil_f32_e32
+; SI: v_ceil_f32_e32
+; SI: v_ceil_f32_e32
+; SI: v_ceil_f32_e32
+; SI: v_ceil_f32_e32
+; SI: v_ceil_f32_e32
+; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT1:T[0-9]+]]{{\.[XYZW]}}
+; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT2:T[0-9]+]]{{\.[XYZW]}}
+; EG-DAG: CEIL {{\*? *}}[[RESULT1]]
+; EG-DAG: CEIL {{\*? *}}[[RESULT1]]
+; EG-DAG: CEIL {{\*? *}}[[RESULT1]]
+; EG-DAG: CEIL {{\*? *}}[[RESULT1]]
+; EG-DAG: CEIL {{\*? *}}[[RESULT2]]
+; EG-DAG: CEIL {{\*? *}}[[RESULT2]]
+; EG-DAG: CEIL {{\*? *}}[[RESULT2]]
+; EG-DAG: CEIL {{\*? *}}[[RESULT2]]
+define void @fceil_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %x) {
+ %y = call <8 x float> @llvm.ceil.v8f32(<8 x float> %x) nounwind readnone
+ store <8 x float> %y, <8 x float> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fceil_v16f32:
+; SI: v_ceil_f32_e32
+; SI: v_ceil_f32_e32
+; SI: v_ceil_f32_e32
+; SI: v_ceil_f32_e32
+; SI: v_ceil_f32_e32
+; SI: v_ceil_f32_e32
+; SI: v_ceil_f32_e32
+; SI: v_ceil_f32_e32
+; SI: v_ceil_f32_e32
+; SI: v_ceil_f32_e32
+; SI: v_ceil_f32_e32
+; SI: v_ceil_f32_e32
+; SI: v_ceil_f32_e32
+; SI: v_ceil_f32_e32
+; SI: v_ceil_f32_e32
+; SI: v_ceil_f32_e32
+; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT1:T[0-9]+]]{{\.[XYZW]}}
+; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT2:T[0-9]+]]{{\.[XYZW]}}
+; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT3:T[0-9]+]]{{\.[XYZW]}}
+; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT4:T[0-9]+]]{{\.[XYZW]}}
+; EG-DAG: CEIL {{\*? *}}[[RESULT1]]
+; EG-DAG: CEIL {{\*? *}}[[RESULT1]]
+; EG-DAG: CEIL {{\*? *}}[[RESULT1]]
+; EG-DAG: CEIL {{\*? *}}[[RESULT1]]
+; EG-DAG: CEIL {{\*? *}}[[RESULT2]]
+; EG-DAG: CEIL {{\*? *}}[[RESULT2]]
+; EG-DAG: CEIL {{\*? *}}[[RESULT2]]
+; EG-DAG: CEIL {{\*? *}}[[RESULT2]]
+; EG-DAG: CEIL {{\*? *}}[[RESULT3]]
+; EG-DAG: CEIL {{\*? *}}[[RESULT3]]
+; EG-DAG: CEIL {{\*? *}}[[RESULT3]]
+; EG-DAG: CEIL {{\*? *}}[[RESULT3]]
+; EG-DAG: CEIL {{\*? *}}[[RESULT4]]
+; EG-DAG: CEIL {{\*? *}}[[RESULT4]]
+; EG-DAG: CEIL {{\*? *}}[[RESULT4]]
+; EG-DAG: CEIL {{\*? *}}[[RESULT4]]
+define void @fceil_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %x) {
+ %y = call <16 x float> @llvm.ceil.v16f32(<16 x float> %x) nounwind readnone
+ store <16 x float> %y, <16 x float> addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/fceil64.ll b/test/CodeGen/AMDGPU/fceil64.ll
new file mode 100644
index 000000000000..e8c34f0141e4
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fceil64.ll
@@ -0,0 +1,105 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
+
+declare double @llvm.ceil.f64(double) nounwind readnone
+declare <2 x double> @llvm.ceil.v2f64(<2 x double>) nounwind readnone
+declare <3 x double> @llvm.ceil.v3f64(<3 x double>) nounwind readnone
+declare <4 x double> @llvm.ceil.v4f64(<4 x double>) nounwind readnone
+declare <8 x double> @llvm.ceil.v8f64(<8 x double>) nounwind readnone
+declare <16 x double> @llvm.ceil.v16f64(<16 x double>) nounwind readnone
+
+; FUNC-LABEL: {{^}}fceil_f64:
+; CI: v_ceil_f64_e32
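+; SI has no f64 ceil instruction, so the sequence below open-codes it
+; (a summary of the checked pattern, not additional checks): s_bfe_u32 with
+; 0xb0014 extracts the 11-bit exponent at bit 20 of the high word, adding
+; 0xfffffc01 (-1023) removes the bias, the shifted/inverted mask clears the
+; fractional mantissa bits to form trunc(x), and the trailing compares plus
+; v_cndmask/v_add_f64 add 1.0 when x is positive and not already an integer.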
+; SI: s_bfe_u32 [[SEXP:s[0-9]+]], {{s[0-9]+}}, 0xb0014
+; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
+; SI: s_add_i32 s{{[0-9]+}}, [[SEXP]], 0xfffffc01
+; SI: s_lshr_b64
+; SI: s_not_b64
+; SI: s_and_b64
+; SI: cmp_gt_i32
+; SI: cndmask_b32
+; SI: cndmask_b32
+; SI: cmp_lt_i32
+; SI: cndmask_b32
+; SI: cndmask_b32
+; SI-DAG: v_cmp_lt_f64
+; SI-DAG: v_cmp_lg_f64
+; SI: s_and_b64
+; SI: v_cndmask_b32
+; SI: v_cndmask_b32
+; SI: v_add_f64
+; SI: s_endpgm
+define void @fceil_f64(double addrspace(1)* %out, double %x) {
+ %y = call double @llvm.ceil.f64(double %x) nounwind readnone
+ store double %y, double addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fceil_v2f64:
+; CI: v_ceil_f64_e32
+; CI: v_ceil_f64_e32
+define void @fceil_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) {
+ %y = call <2 x double> @llvm.ceil.v2f64(<2 x double> %x) nounwind readnone
+ store <2 x double> %y, <2 x double> addrspace(1)* %out
+ ret void
+}
+
+; FIXME-FUNC-LABEL: {{^}}fceil_v3f64:
+; FIXME-CI: v_ceil_f64_e32
+; FIXME-CI: v_ceil_f64_e32
+; FIXME-CI: v_ceil_f64_e32
+; define void @fceil_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) {
+; %y = call <3 x double> @llvm.ceil.v3f64(<3 x double> %x) nounwind readnone
+; store <3 x double> %y, <3 x double> addrspace(1)* %out
+; ret void
+; }
+
+; FUNC-LABEL: {{^}}fceil_v4f64:
+; CI: v_ceil_f64_e32
+; CI: v_ceil_f64_e32
+; CI: v_ceil_f64_e32
+; CI: v_ceil_f64_e32
+define void @fceil_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) {
+ %y = call <4 x double> @llvm.ceil.v4f64(<4 x double> %x) nounwind readnone
+ store <4 x double> %y, <4 x double> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fceil_v8f64:
+; CI: v_ceil_f64_e32
+; CI: v_ceil_f64_e32
+; CI: v_ceil_f64_e32
+; CI: v_ceil_f64_e32
+; CI: v_ceil_f64_e32
+; CI: v_ceil_f64_e32
+; CI: v_ceil_f64_e32
+; CI: v_ceil_f64_e32
+define void @fceil_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) {
+ %y = call <8 x double> @llvm.ceil.v8f64(<8 x double> %x) nounwind readnone
+ store <8 x double> %y, <8 x double> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fceil_v16f64:
+; CI: v_ceil_f64_e32
+; CI: v_ceil_f64_e32
+; CI: v_ceil_f64_e32
+; CI: v_ceil_f64_e32
+; CI: v_ceil_f64_e32
+; CI: v_ceil_f64_e32
+; CI: v_ceil_f64_e32
+; CI: v_ceil_f64_e32
+; CI: v_ceil_f64_e32
+; CI: v_ceil_f64_e32
+; CI: v_ceil_f64_e32
+; CI: v_ceil_f64_e32
+; CI: v_ceil_f64_e32
+; CI: v_ceil_f64_e32
+; CI: v_ceil_f64_e32
+; CI: v_ceil_f64_e32
+define void @fceil_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %x) {
+ %y = call <16 x double> @llvm.ceil.v16f64(<16 x double> %x) nounwind readnone
+ store <16 x double> %y, <16 x double> addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/fcmp-cnd.ll b/test/CodeGen/AMDGPU/fcmp-cnd.ll
new file mode 100644
index 000000000000..530274f920f0
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fcmp-cnd.ll
@@ -0,0 +1,14 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+;Not checking arguments 2 and 3 to CNDE, because they may change between
+;registers and literal.x depending on what the optimizer does.
+;CHECK: CNDE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) {
+entry:
+ %0 = load float, float addrspace(1)* %in
+ %cmp = fcmp oeq float %0, 0.000000e+00
+ %value = select i1 %cmp, i32 2, i32 3
+ store i32 %value, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/fcmp-cnde-int-args.ll b/test/CodeGen/AMDGPU/fcmp-cnde-int-args.ll
new file mode 100644
index 000000000000..c402805feb39
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fcmp-cnde-int-args.ll
@@ -0,0 +1,16 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+; This test checks a bug in R600TargetLowering::LowerSELECT_CC where the
+; chance to optimize the fcmp + select instructions to SET* was missed
+; because the operands to fcmp and select had different types.
+
+; CHECK: SET{{[A-Z]+}}_DX10
+
+define void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) {
+entry:
+ %0 = load float, float addrspace(1)* %in
+ %cmp = fcmp oeq float %0, 0.000000e+00
+ %value = select i1 %cmp, i32 -1, i32 0
+ store i32 %value, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/fcmp.ll b/test/CodeGen/AMDGPU/fcmp.ll
new file mode 100644
index 000000000000..5207ab57bade
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fcmp.ll
@@ -0,0 +1,38 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+; CHECK: {{^}}fcmp_sext:
+; CHECK: SETE_DX10 T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @fcmp_sext(i32 addrspace(1)* %out, float addrspace(1)* %in) {
+entry:
+ %0 = load float, float addrspace(1)* %in
+ %arrayidx1 = getelementptr inbounds float, float addrspace(1)* %in, i32 1
+ %1 = load float, float addrspace(1)* %arrayidx1
+ %cmp = fcmp oeq float %0, %1
+ %sext = sext i1 %cmp to i32
+ store i32 %sext, i32 addrspace(1)* %out
+ ret void
+}
+
+; This test checks that a setcc node with f32 operands is lowered to a
+; SET*_DX10 instruction. Previously we were lowering this to:
+; SET* + FP_TO_SINT
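+; (A SET*_DX10 compare writes the all-ones/zero integer result directly, so
+; no separate conversion instruction is needed after it.)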
+
+; CHECK: {{^}}fcmp_br:
+; CHECK: SET{{[N]*}}E_DX10 * T{{[0-9]+\.[XYZW],}}
+; CHECK-NEXT {{[0-9]+(5.0}}
+
+define void @fcmp_br(i32 addrspace(1)* %out, float %in) {
+entry:
+ %0 = fcmp oeq float %in, 5.0
+ br i1 %0, label %IF, label %ENDIF
+
+IF:
+ %1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
+ store i32 0, i32 addrspace(1)* %1
+ br label %ENDIF
+
+ENDIF:
+ store i32 0, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/fcmp64.ll b/test/CodeGen/AMDGPU/fcmp64.ll
new file mode 100644
index 000000000000..053ab0ed7aaf
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fcmp64.ll
@@ -0,0 +1,74 @@
+; RUN: llc < %s -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
+
+; CHECK-LABEL: {{^}}flt_f64:
+; CHECK: v_cmp_nge_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
+define void @flt_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
+ double addrspace(1)* %in2) {
+ %r0 = load double, double addrspace(1)* %in1
+ %r1 = load double, double addrspace(1)* %in2
+ %r2 = fcmp ult double %r0, %r1
+ %r3 = zext i1 %r2 to i32
+ store i32 %r3, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}fle_f64:
+; CHECK: v_cmp_ngt_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
+define void @fle_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
+ double addrspace(1)* %in2) {
+ %r0 = load double, double addrspace(1)* %in1
+ %r1 = load double, double addrspace(1)* %in2
+ %r2 = fcmp ule double %r0, %r1
+ %r3 = zext i1 %r2 to i32
+ store i32 %r3, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}fgt_f64:
+; CHECK: v_cmp_nle_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
+define void @fgt_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
+ double addrspace(1)* %in2) {
+ %r0 = load double, double addrspace(1)* %in1
+ %r1 = load double, double addrspace(1)* %in2
+ %r2 = fcmp ugt double %r0, %r1
+ %r3 = zext i1 %r2 to i32
+ store i32 %r3, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}fge_f64:
+; CHECK: v_cmp_nlt_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
+define void @fge_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
+ double addrspace(1)* %in2) {
+ %r0 = load double, double addrspace(1)* %in1
+ %r1 = load double, double addrspace(1)* %in2
+ %r2 = fcmp uge double %r0, %r1
+ %r3 = zext i1 %r2 to i32
+ store i32 %r3, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}fne_f64:
+; CHECK: v_cmp_neq_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
+define void @fne_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+ double addrspace(1)* %in2) {
+ %r0 = load double, double addrspace(1)* %in1
+ %r1 = load double, double addrspace(1)* %in2
+ %r2 = fcmp une double %r0, %r1
+ %r3 = select i1 %r2, double %r0, double %r1
+ store double %r3, double addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}feq_f64:
+; CHECK: v_cmp_nlg_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
+define void @feq_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+ double addrspace(1)* %in2) {
+ %r0 = load double, double addrspace(1)* %in1
+ %r1 = load double, double addrspace(1)* %in2
+ %r2 = fcmp ueq double %r0, %r1
+ %r3 = select i1 %r2, double %r0, double %r1
+ store double %r3, double addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/fconst64.ll b/test/CodeGen/AMDGPU/fconst64.ll
new file mode 100644
index 000000000000..89af37545c99
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fconst64.ll
@@ -0,0 +1,13 @@
+; RUN: llc < %s -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
+
+; CHECK: {{^}}fconst_f64:
+; CHECK-DAG: s_mov_b32 {{s[0-9]+}}, 0x40140000
+; CHECK-DAG: s_mov_b32 {{s[0-9]+}}, 0
+
+define void @fconst_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
+ %r1 = load double, double addrspace(1)* %in
+ %r2 = fadd double %r1, 5.000000e+00
+ store double %r2, double addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/fcopysign.f32.ll b/test/CodeGen/AMDGPU/fcopysign.f32.ll
new file mode 100644
index 000000000000..b719d5a39785
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fcopysign.f32.ll
@@ -0,0 +1,53 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+
+declare float @llvm.copysign.f32(float, float) nounwind readnone
+declare <2 x float> @llvm.copysign.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <4 x float> @llvm.copysign.v4f32(<4 x float>, <4 x float>) nounwind readnone
+
+; Try to identify arg based on higher address.
+; FUNC-LABEL: {{^}}test_copysign_f32:
+; SI: s_load_dword [[SMAG:s[0-9]+]], {{.*}} 0xb
+; SI: s_load_dword [[SSIGN:s[0-9]+]], {{.*}} 0xc
+; VI: s_load_dword [[SMAG:s[0-9]+]], {{.*}} 0x2c
+; VI: s_load_dword [[SSIGN:s[0-9]+]], {{.*}} 0x30
+; GCN-DAG: v_mov_b32_e32 [[VSIGN:v[0-9]+]], [[SSIGN]]
+; GCN-DAG: v_mov_b32_e32 [[VMAG:v[0-9]+]], [[SMAG]]
+; GCN-DAG: s_mov_b32 [[SCONST:s[0-9]+]], 0x7fffffff
+; GCN: v_bfi_b32 [[RESULT:v[0-9]+]], [[SCONST]], [[VMAG]], [[VSIGN]]
+; GCN: buffer_store_dword [[RESULT]],
+; GCN: s_endpgm
+
+; EG: BFI_INT
+define void @test_copysign_f32(float addrspace(1)* %out, float %mag, float %sign) nounwind {
+ %result = call float @llvm.copysign.f32(float %mag, float %sign)
+ store float %result, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_copysign_v2f32:
+; GCN: s_endpgm
+
+; EG: BFI_INT
+; EG: BFI_INT
+define void @test_copysign_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %mag, <2 x float> %sign) nounwind {
+ %result = call <2 x float> @llvm.copysign.v2f32(<2 x float> %mag, <2 x float> %sign)
+ store <2 x float> %result, <2 x float> addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_copysign_v4f32:
+; GCN: s_endpgm
+
+; EG: BFI_INT
+; EG: BFI_INT
+; EG: BFI_INT
+; EG: BFI_INT
+define void @test_copysign_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %mag, <4 x float> %sign) nounwind {
+ %result = call <4 x float> @llvm.copysign.v4f32(<4 x float> %mag, <4 x float> %sign)
+ store <4 x float> %result, <4 x float> addrspace(1)* %out, align 16
+ ret void
+}
+
diff --git a/test/CodeGen/AMDGPU/fcopysign.f64.ll b/test/CodeGen/AMDGPU/fcopysign.f64.ll
new file mode 100644
index 000000000000..3d8c55993089
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fcopysign.f64.ll
@@ -0,0 +1,40 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
+
+declare double @llvm.copysign.f64(double, double) nounwind readnone
+declare <2 x double> @llvm.copysign.v2f64(<2 x double>, <2 x double>) nounwind readnone
+declare <4 x double> @llvm.copysign.v4f64(<4 x double>, <4 x double>) nounwind readnone
+
+; FUNC-LABEL: {{^}}test_copysign_f64:
+; SI-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dwordx2 s{{\[}}[[SSIGN_LO:[0-9]+]]:[[SSIGN_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
+; VI-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; VI-DAG: s_load_dwordx2 s{{\[}}[[SSIGN_LO:[0-9]+]]:[[SSIGN_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34
+; GCN-DAG: v_mov_b32_e32 v[[VSIGN_HI:[0-9]+]], s[[SSIGN_HI]]
+; GCN-DAG: v_mov_b32_e32 v[[VMAG_HI:[0-9]+]], s[[SMAG_HI]]
+; GCN-DAG: s_mov_b32 [[SCONST:s[0-9]+]], 0x7fffffff
+; GCN: v_bfi_b32 v[[VRESULT_HI:[0-9]+]], [[SCONST]], v[[VMAG_HI]], v[[VSIGN_HI]]
+; GCN: v_mov_b32_e32 v[[VMAG_LO:[0-9]+]], s[[SMAG_LO]]
+; GCN: buffer_store_dwordx2 v{{\[}}[[VMAG_LO]]:[[VRESULT_HI]]{{\]}}
+; GCN: s_endpgm
+define void @test_copysign_f64(double addrspace(1)* %out, double %mag, double %sign) nounwind {
+ %result = call double @llvm.copysign.f64(double %mag, double %sign)
+ store double %result, double addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_copysign_v2f64:
+; GCN: s_endpgm
+define void @test_copysign_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %mag, <2 x double> %sign) nounwind {
+ %result = call <2 x double> @llvm.copysign.v2f64(<2 x double> %mag, <2 x double> %sign)
+ store <2 x double> %result, <2 x double> addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_copysign_v4f64:
+; GCN: s_endpgm
+define void @test_copysign_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %mag, <4 x double> %sign) nounwind {
+ %result = call <4 x double> @llvm.copysign.v4f64(<4 x double> %mag, <4 x double> %sign)
+ store <4 x double> %result, <4 x double> addrspace(1)* %out, align 8
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/fdiv.f64.ll b/test/CodeGen/AMDGPU/fdiv.f64.ll
new file mode 100644
index 000000000000..7c022e38c808
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fdiv.f64.ll
@@ -0,0 +1,96 @@
+; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=COMMON %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=COMMON %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=COMMON %s
+
+
+; COMMON-LABEL: {{^}}fdiv_f64:
+; COMMON-DAG: buffer_load_dwordx2 [[NUM:v\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0
+; COMMON-DAG: buffer_load_dwordx2 [[DEN:v\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0 offset:8
+; CI-DAG: v_div_scale_f64 [[SCALE0:v\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, [[DEN]], [[DEN]], [[NUM]]
+; CI-DAG: v_div_scale_f64 [[SCALE1:v\[[0-9]+:[0-9]+\]]], vcc, [[NUM]], [[DEN]], [[NUM]]
+
+; Check for div_scale bug workaround on SI
+; SI-DAG: v_div_scale_f64 [[SCALE0:v\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, [[DEN]], [[DEN]], [[NUM]]
+; SI-DAG: v_div_scale_f64 [[SCALE1:v\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, [[NUM]], [[DEN]], [[NUM]]
+
+; COMMON-DAG: v_rcp_f64_e32 [[RCP_SCALE0:v\[[0-9]+:[0-9]+\]]], [[SCALE0]]
+
+; SI-DAG: v_cmp_eq_i32_e32 vcc, {{v[0-9]+}}, {{v[0-9]+}}
+; SI-DAG: v_cmp_eq_i32_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], {{v[0-9]+}}, {{v[0-9]+}}
+; SI-DAG: s_xor_b64 vcc, [[CMP0]], vcc
+
+; COMMON-DAG: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], -[[SCALE0]], [[RCP_SCALE0]], 1.0
+; COMMON-DAG: v_fma_f64 [[FMA1:v\[[0-9]+:[0-9]+\]]], [[RCP_SCALE0]], [[FMA0]], [[RCP_SCALE0]]
+; COMMON-DAG: v_fma_f64 [[FMA2:v\[[0-9]+:[0-9]+\]]], -[[SCALE0]], [[FMA1]], 1.0
+; COMMON-DAG: v_fma_f64 [[FMA3:v\[[0-9]+:[0-9]+\]]], [[FMA1]], [[FMA2]], [[FMA1]]
+; COMMON-DAG: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], [[SCALE1]], [[FMA3]]
+; COMMON-DAG: v_fma_f64 [[FMA4:v\[[0-9]+:[0-9]+\]]], -[[SCALE0]], [[MUL]], [[SCALE1]]
+; COMMON: v_div_fmas_f64 [[FMAS:v\[[0-9]+:[0-9]+\]]], [[FMA4]], [[FMA3]], [[MUL]]
+; COMMON: v_div_fixup_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[FMAS]], [[DEN]], [[NUM]]
+; COMMON: buffer_store_dwordx2 [[RESULT]]
+; COMMON: s_endpgm
+define void @fdiv_f64(double addrspace(1)* %out, double addrspace(1)* %in) nounwind {
+ %gep.1 = getelementptr double, double addrspace(1)* %in, i32 1
+ %num = load double, double addrspace(1)* %in
+ %den = load double, double addrspace(1)* %gep.1
+ %result = fdiv double %num, %den
+ store double %result, double addrspace(1)* %out
+ ret void
+}
+
+; COMMON-LABEL: {{^}}fdiv_f64_s_v:
+define void @fdiv_f64_s_v(double addrspace(1)* %out, double addrspace(1)* %in, double %num) nounwind {
+ %den = load double, double addrspace(1)* %in
+ %result = fdiv double %num, %den
+ store double %result, double addrspace(1)* %out
+ ret void
+}
+
+; COMMON-LABEL: {{^}}fdiv_f64_v_s:
+define void @fdiv_f64_v_s(double addrspace(1)* %out, double addrspace(1)* %in, double %den) nounwind {
+ %num = load double, double addrspace(1)* %in
+ %result = fdiv double %num, %den
+ store double %result, double addrspace(1)* %out
+ ret void
+}
+
+; COMMON-LABEL: {{^}}fdiv_f64_s_s:
+define void @fdiv_f64_s_s(double addrspace(1)* %out, double %num, double %den) nounwind {
+ %result = fdiv double %num, %den
+ store double %result, double addrspace(1)* %out
+ ret void
+}
+
+; COMMON-LABEL: {{^}}v_fdiv_v2f64:
+define void @v_fdiv_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in) nounwind {
+ %gep.1 = getelementptr <2 x double>, <2 x double> addrspace(1)* %in, i32 1
+ %num = load <2 x double>, <2 x double> addrspace(1)* %in
+ %den = load <2 x double>, <2 x double> addrspace(1)* %gep.1
+ %result = fdiv <2 x double> %num, %den
+ store <2 x double> %result, <2 x double> addrspace(1)* %out
+ ret void
+}
+
+; COMMON-LABEL: {{^}}s_fdiv_v2f64:
+define void @s_fdiv_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %num, <2 x double> %den) {
+ %result = fdiv <2 x double> %num, %den
+ store <2 x double> %result, <2 x double> addrspace(1)* %out
+ ret void
+}
+
+; COMMON-LABEL: {{^}}v_fdiv_v4f64:
+define void @v_fdiv_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in) nounwind {
+ %gep.1 = getelementptr <4 x double>, <4 x double> addrspace(1)* %in, i32 1
+ %num = load <4 x double>, <4 x double> addrspace(1)* %in
+ %den = load <4 x double>, <4 x double> addrspace(1)* %gep.1
+ %result = fdiv <4 x double> %num, %den
+ store <4 x double> %result, <4 x double> addrspace(1)* %out
+ ret void
+}
+
+; COMMON-LABEL: {{^}}s_fdiv_v4f64:
+define void @s_fdiv_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %num, <4 x double> %den) {
+ %result = fdiv <4 x double> %num, %den
+ store <4 x double> %result, <4 x double> addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/fdiv.ll b/test/CodeGen/AMDGPU/fdiv.ll
new file mode 100644
index 000000000000..7cbf87336399
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fdiv.ll
@@ -0,0 +1,68 @@
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+; These tests check that fdiv is expanded correctly and also that the
+; scheduler places the RECIP_IEEE and MUL_IEEE instructions in separate
+; instruction groups.
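+;
+; A minimal sketch of the expansion (value names made up, not FileCheck'd):
+;   %q = fdiv float %a, %b
+; is selected as a reciprocal plus a multiply, i.e. %q = %a * (1.0 / %b),
+; which is the RECIP_IEEE/MUL_IEEE pair on R600 and the v_rcp_f32/v_mul_f32
+; pair on SI checked below.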
+
+; FUNC-LABEL: {{^}}fdiv_f32:
+; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Z
+; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Y
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS
+
+; SI-DAG: v_rcp_f32
+; SI-DAG: v_mul_f32
+define void @fdiv_f32(float addrspace(1)* %out, float %a, float %b) {
+entry:
+ %0 = fdiv float %a, %b
+ store float %0, float addrspace(1)* %out
+ ret void
+}
+
+
+
+; FUNC-LABEL: {{^}}fdiv_v2f32:
+; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Z
+; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Y
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS
+
+; SI-DAG: v_rcp_f32
+; SI-DAG: v_mul_f32
+; SI-DAG: v_rcp_f32
+; SI-DAG: v_mul_f32
+define void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
+entry:
+ %0 = fdiv <2 x float> %a, %b
+ store <2 x float> %0, <2 x float> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fdiv_v4f32:
+; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
+
+; SI-DAG: v_rcp_f32
+; SI-DAG: v_mul_f32
+; SI-DAG: v_rcp_f32
+; SI-DAG: v_mul_f32
+; SI-DAG: v_rcp_f32
+; SI-DAG: v_mul_f32
+; SI-DAG: v_rcp_f32
+; SI-DAG: v_mul_f32
+define void @fdiv_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
+ %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
+ %a = load <4 x float>, <4 x float> addrspace(1) * %in
+ %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr
+ %result = fdiv <4 x float> %a, %b
+ store <4 x float> %result, <4 x float> addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/fetch-limits.r600.ll b/test/CodeGen/AMDGPU/fetch-limits.r600.ll
new file mode 100644
index 000000000000..e7160ef5d726
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fetch-limits.r600.ll
@@ -0,0 +1,48 @@
+; RUN: llc < %s -march=r600 -mcpu=r600 | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=rs880 | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=rv670 | FileCheck %s
+
+; R600 supports 8 fetches in a clause
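+; The function below issues 9 texture samples, one more than fits in a single
+; clause, so at least two fetch clauses must be emitted; the two CHECK lines
+; below verify that.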
+; CHECK: {{^}}fetch_limits_r600:
+; CHECK: Fetch clause
+; CHECK: Fetch clause
+
+define void @fetch_limits_r600() #0 {
+entry:
+ %0 = load <4 x float>, <4 x float> addrspace(8)* null
+ %1 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+ %2 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+ %3 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
+ %4 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
+ %5 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
+ %6 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6)
+ %7 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
+ %8 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8)
+ %res0 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %0, i32 0, i32 0, i32 1)
+ %res1 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %1, i32 0, i32 0, i32 1)
+ %res2 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %2, i32 0, i32 0, i32 1)
+ %res3 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %3, i32 0, i32 0, i32 1)
+ %res4 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %4, i32 0, i32 0, i32 1)
+ %res5 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %5, i32 0, i32 0, i32 1)
+ %res6 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %6, i32 0, i32 0, i32 1)
+ %res7 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %7, i32 0, i32 0, i32 1)
+ %res8 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 0, i32 0, i32 1)
+ %a = fadd <4 x float> %res0, %res1
+ %b = fadd <4 x float> %res2, %res3
+ %c = fadd <4 x float> %res4, %res5
+ %d = fadd <4 x float> %res6, %res7
+ %e = fadd <4 x float> %res8, %a
+
+ %bc = fadd <4 x float> %b, %c
+ %de = fadd <4 x float> %d, %e
+
+ %bcde = fadd <4 x float> %bc, %de
+
+ call void @llvm.R600.store.swizzle(<4 x float> %bcde, i32 0, i32 1)
+ ret void
+}
+
+attributes #0 = { "ShaderType"="0" } ; Pixel Shader
+
+declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) readnone
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
diff --git a/test/CodeGen/AMDGPU/fetch-limits.r700+.ll b/test/CodeGen/AMDGPU/fetch-limits.r700+.ll
new file mode 100644
index 000000000000..acaea2aa7943
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fetch-limits.r700+.ll
@@ -0,0 +1,81 @@
+; RUN: llc < %s -march=r600 -mcpu=rv710 | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=rv730 | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=rv770 | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=cedar | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=sumo | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=juniper | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=cypress | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=barts | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=turks | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=caicos | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s
+
+; r700+ supports 16 fetches in a clause
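+; The function below issues 17 texture samples, one more than fits in a
+; single clause on r700+, so at least two fetch clauses must be emitted; the
+; two CHECK lines below verify that.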
+; CHECK: {{^}}fetch_limits_r700:
+; CHECK: Fetch clause
+; CHECK: Fetch clause
+
+define void @fetch_limits_r700() #0 {
+entry:
+ %0 = load <4 x float>, <4 x float> addrspace(8)* null
+ %1 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+ %2 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+ %3 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
+ %4 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
+ %5 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
+ %6 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6)
+ %7 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
+ %8 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8)
+ %9 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
+ %10 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10)
+ %11 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 11)
+ %12 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 12)
+ %13 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 13)
+ %14 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14)
+ %15 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 15)
+ %16 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 16)
+ %res0 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %0, i32 0, i32 0, i32 1)
+ %res1 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %1, i32 0, i32 0, i32 1)
+ %res2 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %2, i32 0, i32 0, i32 1)
+ %res3 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %3, i32 0, i32 0, i32 1)
+ %res4 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %4, i32 0, i32 0, i32 1)
+ %res5 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %5, i32 0, i32 0, i32 1)
+ %res6 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %6, i32 0, i32 0, i32 1)
+ %res7 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %7, i32 0, i32 0, i32 1)
+ %res8 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 0, i32 0, i32 1)
+ %res9 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %9, i32 0, i32 0, i32 1)
+ %res10 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %10, i32 0, i32 0, i32 1)
+ %res11 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %11, i32 0, i32 0, i32 1)
+ %res12 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %12, i32 0, i32 0, i32 1)
+ %res13 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %13, i32 0, i32 0, i32 1)
+ %res14 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %14, i32 0, i32 0, i32 1)
+ %res15 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %15, i32 0, i32 0, i32 1)
+ %res16 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %16, i32 0, i32 0, i32 1)
+ %a = fadd <4 x float> %res0, %res1
+ %b = fadd <4 x float> %res2, %res3
+ %c = fadd <4 x float> %res4, %res5
+ %d = fadd <4 x float> %res6, %res7
+ %e = fadd <4 x float> %res8, %res9
+ %f = fadd <4 x float> %res10, %res11
+ %g = fadd <4 x float> %res12, %res13
+ %h = fadd <4 x float> %res14, %res15
+ %i = fadd <4 x float> %res16, %a
+
+ %bc = fadd <4 x float> %b, %c
+ %de = fadd <4 x float> %d, %e
+ %fg = fadd <4 x float> %f, %g
+ %hi = fadd <4 x float> %h, %i
+
+ %bcde = fadd <4 x float> %bc, %de
+ %fghi = fadd <4 x float> %fg, %hi
+
+ %bcdefghi = fadd <4 x float> %bcde, %fghi
+ call void @llvm.R600.store.swizzle(<4 x float> %bcdefghi, i32 0, i32 1)
+ ret void
+}
+
+attributes #0 = { "ShaderType"="0" } ; Pixel Shader
+
+declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) readnone
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
diff --git a/test/CodeGen/AMDGPU/ffloor.f64.ll b/test/CodeGen/AMDGPU/ffloor.f64.ll
new file mode 100644
index 000000000000..45f8382c3929
--- /dev/null
+++ b/test/CodeGen/AMDGPU/ffloor.f64.ll
@@ -0,0 +1,127 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
+
+declare double @llvm.fabs.f64(double %Val)
+declare double @llvm.floor.f64(double) nounwind readnone
+declare <2 x double> @llvm.floor.v2f64(<2 x double>) nounwind readnone
+declare <3 x double> @llvm.floor.v3f64(<3 x double>) nounwind readnone
+declare <4 x double> @llvm.floor.v4f64(<4 x double>) nounwind readnone
+declare <8 x double> @llvm.floor.v8f64(<8 x double>) nounwind readnone
+declare <16 x double> @llvm.floor.v16f64(<16 x double>) nounwind readnone
+
+; FUNC-LABEL: {{^}}ffloor_f64:
+; CI: v_floor_f64_e32
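+; SI has no v_floor_f64, so the sequence below computes floor(x) = x - fract(x)
+; (a summary of the checked pattern, not additional checks): v_fract_f64
+; produces the fractional part, v_min_f64 clamps it just below 1.0 to work
+; around a hardware corner case, v_cmp_class/v_cndmask pass infinities and
+; NaNs through unchanged, and the final v_add_f64 applies the negated fraction.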
+; SI: v_fract_f64_e32
+; SI: v_min_f64
+; SI: v_cmp_class_f64_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_add_f64
+; SI: s_endpgm
+define void @ffloor_f64(double addrspace(1)* %out, double %x) {
+ %y = call double @llvm.floor.f64(double %x) nounwind readnone
+ store double %y, double addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}ffloor_f64_neg:
+; CI: v_floor_f64_e64
+; SI: v_fract_f64_e64 {{v[[0-9]+:[0-9]+]}}, -[[INPUT:s[[0-9]+:[0-9]+]]]
+; SI: v_min_f64
+; SI: v_cmp_class_f64_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_add_f64 {{v[[0-9]+:[0-9]+]}}, -[[INPUT]]
+; SI: s_endpgm
+define void @ffloor_f64_neg(double addrspace(1)* %out, double %x) {
+ %neg = fsub double 0.0, %x
+ %y = call double @llvm.floor.f64(double %neg) nounwind readnone
+ store double %y, double addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}ffloor_f64_neg_abs:
+; CI: v_floor_f64_e64
+; SI: v_fract_f64_e64 {{v[[0-9]+:[0-9]+]}}, -|[[INPUT:s[[0-9]+:[0-9]+]]]|
+; SI: v_min_f64
+; SI: v_cmp_class_f64_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_add_f64 {{v[[0-9]+:[0-9]+]}}, -|[[INPUT]]|
+; SI: s_endpgm
+define void @ffloor_f64_neg_abs(double addrspace(1)* %out, double %x) {
+ %abs = call double @llvm.fabs.f64(double %x)
+ %neg = fsub double 0.0, %abs
+ %y = call double @llvm.floor.f64(double %neg) nounwind readnone
+ store double %y, double addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}ffloor_v2f64:
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+define void @ffloor_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) {
+ %y = call <2 x double> @llvm.floor.v2f64(<2 x double> %x) nounwind readnone
+ store <2 x double> %y, <2 x double> addrspace(1)* %out
+ ret void
+}
+
+; FIXME-FUNC-LABEL: {{^}}ffloor_v3f64:
+; FIXME-CI: v_floor_f64_e32
+; FIXME-CI: v_floor_f64_e32
+; FIXME-CI: v_floor_f64_e32
+; define void @ffloor_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) {
+; %y = call <3 x double> @llvm.floor.v3f64(<3 x double> %x) nounwind readnone
+; store <3 x double> %y, <3 x double> addrspace(1)* %out
+; ret void
+; }
+
+; FUNC-LABEL: {{^}}ffloor_v4f64:
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+define void @ffloor_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) {
+ %y = call <4 x double> @llvm.floor.v4f64(<4 x double> %x) nounwind readnone
+ store <4 x double> %y, <4 x double> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}ffloor_v8f64:
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+define void @ffloor_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) {
+ %y = call <8 x double> @llvm.floor.v8f64(<8 x double> %x) nounwind readnone
+ store <8 x double> %y, <8 x double> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}ffloor_v16f64:
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+define void @ffloor_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %x) {
+ %y = call <16 x double> @llvm.floor.v16f64(<16 x double> %x) nounwind readnone
+ store <16 x double> %y, <16 x double> addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/ffloor.ll b/test/CodeGen/AMDGPU/ffloor.ll
new file mode 100644
index 000000000000..61c46ac2bc03
--- /dev/null
+++ b/test/CodeGen/AMDGPU/ffloor.ll
@@ -0,0 +1,49 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}floor_f32:
+; SI: v_floor_f32_e32
+; R600: FLOOR
+define void @floor_f32(float addrspace(1)* %out, float %in) {
+ %tmp = call float @llvm.floor.f32(float %in) #0
+ store float %tmp, float addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}floor_v2f32:
+; SI: v_floor_f32_e32
+; SI: v_floor_f32_e32
+
+define void @floor_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
+ %tmp = call <2 x float> @llvm.floor.v2f32(<2 x float> %in) #0
+ store <2 x float> %tmp, <2 x float> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}floor_v4f32:
+; SI: v_floor_f32_e32
+; SI: v_floor_f32_e32
+; SI: v_floor_f32_e32
+; SI: v_floor_f32_e32
+
+; R600: FLOOR
+; R600: FLOOR
+; R600: FLOOR
+; R600: FLOOR
+define void @floor_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
+ %tmp = call <4 x float> @llvm.floor.v4f32(<4 x float> %in) #0
+ store <4 x float> %tmp, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+; Function Attrs: nounwind readnone
+declare float @llvm.floor.f32(float) #0
+
+; Function Attrs: nounwind readnone
+declare <2 x float> @llvm.floor.v2f32(<2 x float>) #0
+
+; Function Attrs: nounwind readnone
+declare <4 x float> @llvm.floor.v4f32(<4 x float>) #0
+
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/flat-address-space.ll b/test/CodeGen/AMDGPU/flat-address-space.ll
new file mode 100644
index 000000000000..8ceca078f2d6
--- /dev/null
+++ b/test/CodeGen/AMDGPU/flat-address-space.ll
@@ -0,0 +1,184 @@
+; RUN: llc -O0 -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-NO-PROMOTE %s
+; RUN: llc -O0 -march=amdgcn -mcpu=bonaire -mattr=+promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-PROMOTE %s
+; RUN: llc -O0 -march=amdgcn -mcpu=tonga -mattr=-promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-NO-PROMOTE %s
+; RUN: llc -O0 -march=amdgcn -mcpu=tonga -mattr=+promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-PROMOTE %s
+
+; Disable optimizations in case there are optimizations added that
+; specialize away generic pointer accesses.
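+; In this snapshot addrspace(4) is the flat (generic) address space, so the
+; addrspacecasts below should produce pointers that select flat_load_* and
+; flat_store_* instructions rather than buffer or ds operations.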
+
+
+; CHECK-LABEL: {{^}}branch_use_flat_i32:
+; CHECK: flat_store_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}
+; CHECK: s_endpgm
+define void @branch_use_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %gptr, i32 addrspace(3)* %lptr, i32 %x, i32 %c) #0 {
+entry:
+ %cmp = icmp ne i32 %c, 0
+ br i1 %cmp, label %local, label %global
+
+local:
+ %flat_local = addrspacecast i32 addrspace(3)* %lptr to i32 addrspace(4)*
+ br label %end
+
+global:
+ %flat_global = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)*
+ br label %end
+
+end:
+ %fptr = phi i32 addrspace(4)* [ %flat_local, %local ], [ %flat_global, %global ]
+ store i32 %x, i32 addrspace(4)* %fptr, align 4
+; %val = load i32, i32 addrspace(4)* %fptr, align 4
+; store i32 %val, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+
+
+; These testcases might become useless when there are optimizations to
+; remove generic pointers.
+
+; CHECK-LABEL: {{^}}store_flat_i32:
+; CHECK: v_mov_b32_e32 v[[DATA:[0-9]+]], {{s[0-9]+}}
+; CHECK: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], {{s[0-9]+}}
+; CHECK: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], {{s[0-9]+}}
+; CHECK: flat_store_dword v[[DATA]], v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
+define void @store_flat_i32(i32 addrspace(1)* %gptr, i32 %x) #0 {
+ %fptr = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)*
+ store i32 %x, i32 addrspace(4)* %fptr, align 4
+ ret void
+}
+
+; CHECK-LABEL: {{^}}store_flat_i64:
+; CHECK: flat_store_dwordx2
+define void @store_flat_i64(i64 addrspace(1)* %gptr, i64 %x) #0 {
+ %fptr = addrspacecast i64 addrspace(1)* %gptr to i64 addrspace(4)*
+ store i64 %x, i64 addrspace(4)* %fptr, align 8
+ ret void
+}
+
+; CHECK-LABEL: {{^}}store_flat_v4i32:
+; CHECK: flat_store_dwordx4
+define void @store_flat_v4i32(<4 x i32> addrspace(1)* %gptr, <4 x i32> %x) #0 {
+ %fptr = addrspacecast <4 x i32> addrspace(1)* %gptr to <4 x i32> addrspace(4)*
+ store <4 x i32> %x, <4 x i32> addrspace(4)* %fptr, align 16
+ ret void
+}
+
+; CHECK-LABEL: {{^}}store_flat_trunc_i16:
+; CHECK: flat_store_short
+define void @store_flat_trunc_i16(i16 addrspace(1)* %gptr, i32 %x) #0 {
+ %fptr = addrspacecast i16 addrspace(1)* %gptr to i16 addrspace(4)*
+ %y = trunc i32 %x to i16
+ store i16 %y, i16 addrspace(4)* %fptr, align 2
+ ret void
+}
+
+; CHECK-LABEL: {{^}}store_flat_trunc_i8:
+; CHECK: flat_store_byte
+define void @store_flat_trunc_i8(i8 addrspace(1)* %gptr, i32 %x) #0 {
+ %fptr = addrspacecast i8 addrspace(1)* %gptr to i8 addrspace(4)*
+ %y = trunc i32 %x to i8
+ store i8 %y, i8 addrspace(4)* %fptr, align 2
+ ret void
+}
+
+
+
+; CHECK-LABEL: {{^}}load_flat_i32:
+; CHECK: flat_load_dword
+define void @load_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %gptr) #0 {
+ %fptr = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)*
+ %fload = load i32, i32 addrspace(4)* %fptr, align 4
+ store i32 %fload, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; CHECK-LABEL: {{^}}load_flat_i64:
+; CHECK: flat_load_dwordx2
+define void @load_flat_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %gptr) #0 {
+ %fptr = addrspacecast i64 addrspace(1)* %gptr to i64 addrspace(4)*
+ %fload = load i64, i64 addrspace(4)* %fptr, align 4
+ store i64 %fload, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; CHECK-LABEL: {{^}}load_flat_v4i32:
+; CHECK: flat_load_dwordx4
+define void @load_flat_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %gptr) #0 {
+ %fptr = addrspacecast <4 x i32> addrspace(1)* %gptr to <4 x i32> addrspace(4)*
+ %fload = load <4 x i32>, <4 x i32> addrspace(4)* %fptr, align 4
+ store <4 x i32> %fload, <4 x i32> addrspace(1)* %out, align 8
+ ret void
+}
+
+; CHECK-LABEL: {{^}}sextload_flat_i8:
+; CHECK: flat_load_sbyte
+define void @sextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %gptr) #0 {
+ %fptr = addrspacecast i8 addrspace(1)* %gptr to i8 addrspace(4)*
+ %fload = load i8, i8 addrspace(4)* %fptr, align 4
+ %ext = sext i8 %fload to i32
+ store i32 %ext, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; CHECK-LABEL: {{^}}zextload_flat_i8:
+; CHECK: flat_load_ubyte
+define void @zextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %gptr) #0 {
+ %fptr = addrspacecast i8 addrspace(1)* %gptr to i8 addrspace(4)*
+ %fload = load i8, i8 addrspace(4)* %fptr, align 4
+ %ext = zext i8 %fload to i32
+ store i32 %ext, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; CHECK-LABEL: {{^}}sextload_flat_i16:
+; CHECK: flat_load_sshort
+define void @sextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %gptr) #0 {
+ %fptr = addrspacecast i16 addrspace(1)* %gptr to i16 addrspace(4)*
+ %fload = load i16, i16 addrspace(4)* %fptr, align 4
+ %ext = sext i16 %fload to i32
+ store i32 %ext, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; CHECK-LABEL: {{^}}zextload_flat_i16:
+; CHECK: flat_load_ushort
+define void @zextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %gptr) #0 {
+ %fptr = addrspacecast i16 addrspace(1)* %gptr to i16 addrspace(4)*
+ %fload = load i16, i16 addrspace(4)* %fptr, align 4
+ %ext = zext i16 %fload to i32
+ store i32 %ext, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+
+
+; TODO: This should not be zero when registers are used for small
+; scratch allocations again.
+
+; Check for prologue initializing special SGPRs pointing to scratch.
+; CHECK-LABEL: {{^}}store_flat_scratch:
+; CHECK: s_movk_i32 flat_scratch_lo, 0
+; CHECK-NO-PROMOTE: s_movk_i32 flat_scratch_hi, 0x28{{$}}
+; CHECK-PROMOTE: s_movk_i32 flat_scratch_hi, 0x0{{$}}
+; CHECK: flat_store_dword
+; CHECK: s_barrier
+; CHECK: flat_load_dword
+define void @store_flat_scratch(i32 addrspace(1)* noalias %out, i32) #0 {
+ %alloca = alloca i32, i32 9, align 4
+ %x = call i32 @llvm.r600.read.tidig.x() #3
+ %pptr = getelementptr i32, i32* %alloca, i32 %x
+ %fptr = addrspacecast i32* %pptr to i32 addrspace(4)*
+ store i32 %x, i32 addrspace(4)* %fptr
+ ; Dummy call
+ call void @llvm.AMDGPU.barrier.local() #1
+ %reload = load i32, i32 addrspace(4)* %fptr, align 4
+ store i32 %reload, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+declare void @llvm.AMDGPU.barrier.local() #1
+declare i32 @llvm.r600.read.tidig.x() #3
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind noduplicate }
+attributes #3 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/floor.ll b/test/CodeGen/AMDGPU/floor.ll
new file mode 100644
index 000000000000..c6bfb8567a0f
--- /dev/null
+++ b/test/CodeGen/AMDGPU/floor.ll
@@ -0,0 +1,15 @@
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s
+
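+; A plain libm-style call to @floor is expected to be selected directly as
+; the R600 FLOOR instruction rather than being emitted as a call.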
+; CHECK: FLOOR * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+define void @test(<4 x float> inreg %reg0) #0 {
+ %r0 = extractelement <4 x float> %reg0, i32 0
+ %r1 = call float @floor(float %r0)
+ %vec = insertelement <4 x float> undef, float %r1, i32 0
+ call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
+ ret void
+}
+
+declare float @floor(float) readonly
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+
+attributes #0 = { "ShaderType"="0" }
diff --git a/test/CodeGen/AMDGPU/fma-combine.ll b/test/CodeGen/AMDGPU/fma-combine.ll
new file mode 100644
index 000000000000..bd574b877117
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fma-combine.ll
@@ -0,0 +1,368 @@
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI-FASTFMAF -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI-SLOWFMAF -check-prefix=SI -check-prefix=FUNC %s
+
+declare i32 @llvm.r600.read.tidig.x() #0
+declare double @llvm.fabs.f64(double) #0
+declare double @llvm.fma.f64(double, double, double) #0
+declare float @llvm.fma.f32(float, float, float) #0
+
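+; Both run lines pass -fp-contract=fast, which is what allows the separate
+; fmul and fadd/fsub operations in these tests to be fused into single
+; v_fma_f64 instructions.
+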
+; (fadd (fmul x, y), z) -> (fma x, y, z)
+; FUNC-LABEL: {{^}}combine_to_fma_f64_0:
+; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
+; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]]
+; SI: buffer_store_dwordx2 [[RESULT]]
+define void @combine_to_fma_f64_0(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+ %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
+ %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
+ %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid
+
+ %a = load double, double addrspace(1)* %gep.0
+ %b = load double, double addrspace(1)* %gep.1
+ %c = load double, double addrspace(1)* %gep.2
+
+ %mul = fmul double %a, %b
+ %fma = fadd double %mul, %c
+ store double %fma, double addrspace(1)* %gep.out
+ ret void
+}
+
+; (fadd (fmul x, y), z) -> (fma x, y, z)
+; FUNC-LABEL: {{^}}combine_to_fma_f64_0_2use:
+; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
+; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
+; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]]
+; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[D]]
+; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI: s_endpgm
+define void @combine_to_fma_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+ %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
+ %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
+ %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
+ %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
+ %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1
+
+ %a = load double, double addrspace(1)* %gep.0
+ %b = load double, double addrspace(1)* %gep.1
+ %c = load double, double addrspace(1)* %gep.2
+ %d = load double, double addrspace(1)* %gep.3
+
+ %mul = fmul double %a, %b
+ %fma0 = fadd double %mul, %c
+ %fma1 = fadd double %mul, %d
+ store double %fma0, double addrspace(1)* %gep.out.0
+ store double %fma1, double addrspace(1)* %gep.out.1
+ ret void
+}
+
+; (fadd x, (fmul y, z)) -> (fma y, z, x)
+; FUNC-LABEL: {{^}}combine_to_fma_f64_1:
+; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
+; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]]
+; SI: buffer_store_dwordx2 [[RESULT]]
+define void @combine_to_fma_f64_1(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+ %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
+ %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
+ %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid
+
+ %a = load double, double addrspace(1)* %gep.0
+ %b = load double, double addrspace(1)* %gep.1
+ %c = load double, double addrspace(1)* %gep.2
+
+ %mul = fmul double %a, %b
+ %fma = fadd double %c, %mul
+ store double %fma, double addrspace(1)* %gep.out
+ ret void
+}
+
+; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
+; FUNC-LABEL: {{^}}combine_to_fma_fsub_0_f64:
+; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
+; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[C]]
+; SI: buffer_store_dwordx2 [[RESULT]]
+define void @combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+ %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
+ %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
+ %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid
+
+ %a = load double, double addrspace(1)* %gep.0
+ %b = load double, double addrspace(1)* %gep.1
+ %c = load double, double addrspace(1)* %gep.2
+
+ %mul = fmul double %a, %b
+ %fma = fsub double %mul, %c
+ store double %fma, double addrspace(1)* %gep.out
+ ret void
+}
+
+; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
+; FUNC-LABEL: {{^}}combine_to_fma_fsub_f64_0_2use:
+; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
+; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
+; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[C]]
+; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[D]]
+; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI: s_endpgm
+define void @combine_to_fma_fsub_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+ %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
+ %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
+ %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
+ %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
+ %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1
+
+ %a = load double, double addrspace(1)* %gep.0
+ %b = load double, double addrspace(1)* %gep.1
+ %c = load double, double addrspace(1)* %gep.2
+ %d = load double, double addrspace(1)* %gep.3
+
+ %mul = fmul double %a, %b
+ %fma0 = fsub double %mul, %c
+ %fma1 = fsub double %mul, %d
+ store double %fma0, double addrspace(1)* %gep.out.0
+ store double %fma1, double addrspace(1)* %gep.out.1
+ ret void
+}
+
+; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
+; FUNC-LABEL: {{^}}combine_to_fma_fsub_1_f64:
+; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
+; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[C]]
+; SI: buffer_store_dwordx2 [[RESULT]]
+define void @combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+ %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
+ %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
+ %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid
+
+ %a = load double, double addrspace(1)* %gep.0
+ %b = load double, double addrspace(1)* %gep.1
+ %c = load double, double addrspace(1)* %gep.2
+
+ %mul = fmul double %a, %b
+ %fma = fsub double %c, %mul
+ store double %fma, double addrspace(1)* %gep.out
+ ret void
+}
+
+; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
+; FUNC-LABEL: {{^}}combine_to_fma_fsub_1_f64_2use:
+; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
+; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
+; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[C]]
+; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[D]]
+; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI: s_endpgm
+define void @combine_to_fma_fsub_1_f64_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+ %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
+ %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
+ %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
+ %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
+ %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1
+
+ %a = load double, double addrspace(1)* %gep.0
+ %b = load double, double addrspace(1)* %gep.1
+ %c = load double, double addrspace(1)* %gep.2
+ %d = load double, double addrspace(1)* %gep.3
+
+ %mul = fmul double %a, %b
+ %fma0 = fsub double %c, %mul
+ %fma1 = fsub double %d, %mul
+ store double %fma0, double addrspace(1)* %gep.out.0
+ store double %fma1, double addrspace(1)* %gep.out.1
+ ret void
+}
+
+; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
+; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64:
+; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
+; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]]
+; SI: buffer_store_dwordx2 [[RESULT]]
+define void @combine_to_fma_fsub_2_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+ %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
+ %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
+ %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid
+
+ %a = load double, double addrspace(1)* %gep.0
+ %b = load double, double addrspace(1)* %gep.1
+ %c = load double, double addrspace(1)* %gep.2
+
+ %mul = fmul double %a, %b
+ %mul.neg = fsub double -0.0, %mul
+ %fma = fsub double %mul.neg, %c
+
+ store double %fma, double addrspace(1)* %gep.out
+ ret void
+}
+
+; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
+; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64_2uses_neg:
+; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
+; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
+; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]]
+; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[D]]
+; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI: s_endpgm
+define void @combine_to_fma_fsub_2_f64_2uses_neg(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+ %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
+ %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
+ %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
+ %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
+ %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1
+
+ %a = load double, double addrspace(1)* %gep.0
+ %b = load double, double addrspace(1)* %gep.1
+ %c = load double, double addrspace(1)* %gep.2
+ %d = load double, double addrspace(1)* %gep.3
+
+ %mul = fmul double %a, %b
+ %mul.neg = fsub double -0.0, %mul
+ %fma0 = fsub double %mul.neg, %c
+ %fma1 = fsub double %mul.neg, %d
+
+ store double %fma0, double addrspace(1)* %gep.out.0
+ store double %fma1, double addrspace(1)* %gep.out.1
+ ret void
+}
+
+; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
+; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64_2uses_mul:
+; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
+; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
+; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]]
+; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[D]]
+; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI: s_endpgm
+define void @combine_to_fma_fsub_2_f64_2uses_mul(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+ %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
+ %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
+ %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
+ %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
+ %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1
+
+ %a = load double, double addrspace(1)* %gep.0
+ %b = load double, double addrspace(1)* %gep.1
+ %c = load double, double addrspace(1)* %gep.2
+ %d = load double, double addrspace(1)* %gep.3
+
+ %mul = fmul double %a, %b
+ %mul.neg = fsub double -0.0, %mul
+ %fma0 = fsub double %mul.neg, %c
+ %fma1 = fsub double %mul, %d
+
+ store double %fma0, double addrspace(1)* %gep.out.0
+ store double %fma1, double addrspace(1)* %gep.out.1
+ ret void
+}
+
+; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y (fma u, v, (fneg z)))
+
+; FUNC-LABEL: {{^}}aggressive_combine_to_fma_fsub_0_f64:
+; SI-DAG: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dwordx2 [[Y:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dwordx2 [[Z:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
+; SI-DAG: buffer_load_dwordx2 [[U:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
+; SI-DAG: buffer_load_dwordx2 [[V:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:32{{$}}
+; SI: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], [[U]], [[V]], -[[Z]]
+; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[X]], [[Y]], [[FMA0]]
+; SI: buffer_store_dwordx2 [[RESULT]]
+define void @aggressive_combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+ %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
+ %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
+ %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
+ %gep.4 = getelementptr double, double addrspace(1)* %gep.0, i32 4
+ %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid
+
+ %x = load double, double addrspace(1)* %gep.0
+ %y = load double, double addrspace(1)* %gep.1
+ %z = load double, double addrspace(1)* %gep.2
+ %u = load double, double addrspace(1)* %gep.3
+ %v = load double, double addrspace(1)* %gep.4
+
+ %tmp0 = fmul double %u, %v
+ %tmp1 = call double @llvm.fma.f64(double %x, double %y, double %tmp0) #0
+ %tmp2 = fsub double %tmp1, %z
+
+ store double %tmp2, double addrspace(1)* %gep.out
+ ret void
+}
+
+; fold (fsub x, (fma y, z, (fmul u, v)))
+; -> (fma (fneg y), z, (fma (fneg u), v, x))
+
+; FUNC-LABEL: {{^}}aggressive_combine_to_fma_fsub_1_f64:
+; SI-DAG: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dwordx2 [[Y:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dwordx2 [[Z:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
+; SI-DAG: buffer_load_dwordx2 [[U:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
+; SI-DAG: buffer_load_dwordx2 [[V:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:32{{$}}
+; SI: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], -[[U]], [[V]], [[X]]
+; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[Y]], [[Z]], [[FMA0]]
+; SI: buffer_store_dwordx2 [[RESULT]]
+define void @aggressive_combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+ %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
+ %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
+ %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
+ %gep.4 = getelementptr double, double addrspace(1)* %gep.0, i32 4
+ %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid
+
+ %x = load double, double addrspace(1)* %gep.0
+ %y = load double, double addrspace(1)* %gep.1
+ %z = load double, double addrspace(1)* %gep.2
+ %u = load double, double addrspace(1)* %gep.3
+ %v = load double, double addrspace(1)* %gep.4
+
+ %tmp0 = fmul double %u, %v
+ %tmp1 = call double @llvm.fma.f64(double %y, double %z, double %tmp0) #0
+ %tmp2 = fsub double %x, %tmp1
+
+ store double %tmp2, double addrspace(1)* %gep.out
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/fma.f64.ll b/test/CodeGen/AMDGPU/fma.f64.ll
new file mode 100644
index 000000000000..0a55ef778557
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fma.f64.ll
@@ -0,0 +1,47 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+declare double @llvm.fma.f64(double, double, double) nounwind readnone
+declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
+declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>) nounwind readnone
+
+
+; FUNC-LABEL: {{^}}fma_f64:
+; SI: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
+define void @fma_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+ double addrspace(1)* %in2, double addrspace(1)* %in3) {
+ %r0 = load double, double addrspace(1)* %in1
+ %r1 = load double, double addrspace(1)* %in2
+ %r2 = load double, double addrspace(1)* %in3
+ %r3 = tail call double @llvm.fma.f64(double %r0, double %r1, double %r2)
+ store double %r3, double addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fma_v2f64:
+; SI: v_fma_f64
+; SI: v_fma_f64
+define void @fma_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1,
+ <2 x double> addrspace(1)* %in2, <2 x double> addrspace(1)* %in3) {
+ %r0 = load <2 x double>, <2 x double> addrspace(1)* %in1
+ %r1 = load <2 x double>, <2 x double> addrspace(1)* %in2
+ %r2 = load <2 x double>, <2 x double> addrspace(1)* %in3
+ %r3 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %r0, <2 x double> %r1, <2 x double> %r2)
+ store <2 x double> %r3, <2 x double> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fma_v4f64:
+; SI: v_fma_f64
+; SI: v_fma_f64
+; SI: v_fma_f64
+; SI: v_fma_f64
+define void @fma_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in1,
+ <4 x double> addrspace(1)* %in2, <4 x double> addrspace(1)* %in3) {
+ %r0 = load <4 x double>, <4 x double> addrspace(1)* %in1
+ %r1 = load <4 x double>, <4 x double> addrspace(1)* %in2
+ %r2 = load <4 x double>, <4 x double> addrspace(1)* %in3
+ %r3 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %r0, <4 x double> %r1, <4 x double> %r2)
+ store <4 x double> %r3, <4 x double> addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/fma.ll b/test/CodeGen/AMDGPU/fma.ll
new file mode 100644
index 000000000000..d6024aa0b4c5
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fma.ll
@@ -0,0 +1,92 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+declare float @llvm.fma.f32(float, float, float) nounwind readnone
+declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) nounwind readnone
+declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
+
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+
+; FUNC-LABEL: {{^}}fma_f32:
+; SI: v_fma_f32 {{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}}
+
+; EG: MEM_RAT_{{.*}} STORE_{{.*}} [[RES:T[0-9]\.[XYZW]]], {{T[0-9]\.[XYZW]}},
+; EG: FMA {{\*? *}}[[RES]]
+define void @fma_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
+ float addrspace(1)* %in2, float addrspace(1)* %in3) {
+ %r0 = load float, float addrspace(1)* %in1
+ %r1 = load float, float addrspace(1)* %in2
+ %r2 = load float, float addrspace(1)* %in3
+ %r3 = tail call float @llvm.fma.f32(float %r0, float %r1, float %r2)
+ store float %r3, float addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fma_v2f32:
+; SI: v_fma_f32
+; SI: v_fma_f32
+
+; EG: MEM_RAT_{{.*}} STORE_{{.*}} [[RES:T[0-9]]].[[CHLO:[XYZW]]][[CHHI:[XYZW]]], {{T[0-9]\.[XYZW]}},
+; EG-DAG: FMA {{\*? *}}[[RES]].[[CHLO]]
+; EG-DAG: FMA {{\*? *}}[[RES]].[[CHHI]]
+define void @fma_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in1,
+ <2 x float> addrspace(1)* %in2, <2 x float> addrspace(1)* %in3) {
+ %r0 = load <2 x float>, <2 x float> addrspace(1)* %in1
+ %r1 = load <2 x float>, <2 x float> addrspace(1)* %in2
+ %r2 = load <2 x float>, <2 x float> addrspace(1)* %in3
+ %r3 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %r0, <2 x float> %r1, <2 x float> %r2)
+ store <2 x float> %r3, <2 x float> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fma_v4f32:
+; SI: v_fma_f32
+; SI: v_fma_f32
+; SI: v_fma_f32
+; SI: v_fma_f32
+
+; EG: MEM_RAT_{{.*}} STORE_{{.*}} [[RES:T[0-9]]].{{[XYZW][XYZW][XYZW][XYZW]}}, {{T[0-9]\.[XYZW]}},
+; EG-DAG: FMA {{\*? *}}[[RES]].X
+; EG-DAG: FMA {{\*? *}}[[RES]].Y
+; EG-DAG: FMA {{\*? *}}[[RES]].Z
+; EG-DAG: FMA {{\*? *}}[[RES]].W
+define void @fma_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in1,
+ <4 x float> addrspace(1)* %in2, <4 x float> addrspace(1)* %in3) {
+ %r0 = load <4 x float>, <4 x float> addrspace(1)* %in1
+ %r1 = load <4 x float>, <4 x float> addrspace(1)* %in2
+ %r2 = load <4 x float>, <4 x float> addrspace(1)* %in3
+ %r3 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %r0, <4 x float> %r1, <4 x float> %r2)
+ store <4 x float> %r3, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: @fma_commute_mul_inline_imm_f32
+; SI: v_fma_f32 {{v[0-9]+}}, 2.0, {{v[0-9]+}}, {{v[0-9]+}}
+define void @fma_commute_mul_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
+ %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
+ %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid
+ %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+
+ %a = load float, float addrspace(1)* %in.a.gep, align 4
+ %b = load float, float addrspace(1)* %in.b.gep, align 4
+
+ %fma = call float @llvm.fma.f32(float %a, float 2.0, float %b)
+ store float %fma, float addrspace(1)* %out.gep, align 4
+ ret void
+}
+
+; FUNC-LABEL: @fma_commute_mul_s_f32
+define void @fma_commute_mul_s_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b, float %b) nounwind {
+ %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
+ %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid
+ %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+
+ %a = load float, float addrspace(1)* %in.a.gep, align 4
+ %c = load float, float addrspace(1)* %in.b.gep, align 4
+
+ %fma = call float @llvm.fma.f32(float %a, float %b, float %c)
+ store float %fma, float addrspace(1)* %out.gep, align 4
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/fmad.ll b/test/CodeGen/AMDGPU/fmad.ll
new file mode 100644
index 000000000000..935e35123f45
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fmad.ll
@@ -0,0 +1,19 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
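+; The separate fmul/fadd pair below is expected to be selected as a single
+; MULADD_IEEE instruction on R600.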
+;CHECK: MULADD_IEEE * {{T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @test(<4 x float> inreg %reg0) #0 {
+ %r0 = extractelement <4 x float> %reg0, i32 0
+ %r1 = extractelement <4 x float> %reg0, i32 1
+ %r2 = extractelement <4 x float> %reg0, i32 2
+ %r3 = fmul float %r0, %r1
+ %r4 = fadd float %r3, %r2
+ %vec = insertelement <4 x float> undef, float %r4, i32 0
+ call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
+ ret void
+}
+
+declare float @fabs(float) readnone
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+
+attributes #0 = { "ShaderType"="0" } \ No newline at end of file
diff --git a/test/CodeGen/AMDGPU/fmax.ll b/test/CodeGen/AMDGPU/fmax.ll
new file mode 100644
index 000000000000..d7127f485c74
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fmax.ll
@@ -0,0 +1,17 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+;CHECK: MAX * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @test(<4 x float> inreg %reg0) #0 {
+ %r0 = extractelement <4 x float> %reg0, i32 0
+ %r1 = extractelement <4 x float> %reg0, i32 1
+ %r2 = fcmp oge float %r0, %r1
+ %r3 = select i1 %r2, float %r0, float %r1
+ %vec = insertelement <4 x float> undef, float %r3, i32 0
+ call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
+ ret void
+}
+
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+
+attributes #0 = { "ShaderType"="0" } \ No newline at end of file
diff --git a/test/CodeGen/AMDGPU/fmax3.f64.ll b/test/CodeGen/AMDGPU/fmax3.f64.ll
new file mode 100644
index 000000000000..f78c71b28264
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fmax3.f64.ll
@@ -0,0 +1,24 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+declare double @llvm.maxnum.f64(double, double) nounwind readnone
+
+; SI-LABEL: {{^}}test_fmax3_f64:
+; SI-DAG: buffer_load_dwordx2 [[REGA:v\[[0-9]+:[0-9]+\]]], s[{{[0-9]+:[0-9]+}}], 0{{$}}
+; SI-DAG: buffer_load_dwordx2 [[REGB:v\[[0-9]+:[0-9]+\]]], s[{{[0-9]+:[0-9]+}}], 0 offset:8
+; SI-DAG: buffer_load_dwordx2 [[REGC:v\[[0-9]+:[0-9]+\]]], s[{{[0-9]+:[0-9]+}}], 0 offset:16
+; SI: v_max_f64 [[REGA]], [[REGA]], [[REGB]]
+; SI: v_max_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[REGA]], [[REGC]]
+; SI: buffer_store_dwordx2 [[RESULT]],
+; SI: s_endpgm
+define void @test_fmax3_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) nounwind {
+ %bptr = getelementptr double, double addrspace(1)* %aptr, i32 1
+ %cptr = getelementptr double, double addrspace(1)* %aptr, i32 2
+ %a = load double, double addrspace(1)* %aptr, align 8
+ %b = load double, double addrspace(1)* %bptr, align 8
+ %c = load double, double addrspace(1)* %cptr, align 8
+ %f0 = call double @llvm.maxnum.f64(double %a, double %b) nounwind readnone
+ %f1 = call double @llvm.maxnum.f64(double %f0, double %c) nounwind readnone
+ store double %f1, double addrspace(1)* %out, align 8
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/fmax3.ll b/test/CodeGen/AMDGPU/fmax3.ll
new file mode 100644
index 000000000000..c3028a6217d5
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fmax3.ll
@@ -0,0 +1,39 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+declare float @llvm.maxnum.f32(float, float) nounwind readnone
+
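+; The two maxnum calls in each test should be combined into a single
+; three-source v_max3_f32, independent of which operand of the second
+; maxnum carries the result of the first.
+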
+; SI-LABEL: {{^}}test_fmax3_olt_0:
+; SI: buffer_load_dword [[REGC:v[0-9]+]]
+; SI: buffer_load_dword [[REGB:v[0-9]+]]
+; SI: buffer_load_dword [[REGA:v[0-9]+]]
+; SI: v_max3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
+; SI: buffer_store_dword [[RESULT]],
+; SI: s_endpgm
+define void @test_fmax3_olt_0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
+ %a = load float, float addrspace(1)* %aptr, align 4
+ %b = load float, float addrspace(1)* %bptr, align 4
+ %c = load float, float addrspace(1)* %cptr, align 4
+ %f0 = call float @llvm.maxnum.f32(float %a, float %b) nounwind readnone
+ %f1 = call float @llvm.maxnum.f32(float %f0, float %c) nounwind readnone
+ store float %f1, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; Commute operand of second fmax
+; SI-LABEL: {{^}}test_fmax3_olt_1:
+; SI: buffer_load_dword [[REGB:v[0-9]+]]
+; SI: buffer_load_dword [[REGA:v[0-9]+]]
+; SI: buffer_load_dword [[REGC:v[0-9]+]]
+; SI: v_max3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
+; SI: buffer_store_dword [[RESULT]],
+; SI: s_endpgm
+define void @test_fmax3_olt_1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
+ %a = load float, float addrspace(1)* %aptr, align 4
+ %b = load float, float addrspace(1)* %bptr, align 4
+ %c = load float, float addrspace(1)* %cptr, align 4
+ %f0 = call float @llvm.maxnum.f32(float %a, float %b) nounwind readnone
+ %f1 = call float @llvm.maxnum.f32(float %c, float %f0) nounwind readnone
+ store float %f1, float addrspace(1)* %out, align 4
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/fmax_legacy.f64.ll b/test/CodeGen/AMDGPU/fmax_legacy.f64.ll
new file mode 100644
index 000000000000..828243888ac7
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fmax_legacy.f64.ll
@@ -0,0 +1,67 @@
+; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; Make sure we don't try to form FMAX_LEGACY nodes with f64
+
+declare i32 @llvm.r600.read.tidig.x() #1
+
+; FUNC-LABEL: @test_fmax_legacy_uge_f64
+define void @test_fmax_legacy_uge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
+
+ %a = load double, double addrspace(1)* %gep.0, align 8
+ %b = load double, double addrspace(1)* %gep.1, align 8
+
+ %cmp = fcmp uge double %a, %b
+ %val = select i1 %cmp, double %a, double %b
+ store double %val, double addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: @test_fmax_legacy_oge_f64
+define void @test_fmax_legacy_oge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
+
+ %a = load double, double addrspace(1)* %gep.0, align 8
+ %b = load double, double addrspace(1)* %gep.1, align 8
+
+ %cmp = fcmp oge double %a, %b
+ %val = select i1 %cmp, double %a, double %b
+ store double %val, double addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: @test_fmax_legacy_ugt_f64
+define void @test_fmax_legacy_ugt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
+
+ %a = load double, double addrspace(1)* %gep.0, align 8
+ %b = load double, double addrspace(1)* %gep.1, align 8
+
+ %cmp = fcmp ugt double %a, %b
+ %val = select i1 %cmp, double %a, double %b
+ store double %val, double addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: @test_fmax_legacy_ogt_f64
+define void @test_fmax_legacy_ogt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
+
+ %a = load double, double addrspace(1)* %gep.0, align 8
+ %b = load double, double addrspace(1)* %gep.1, align 8
+
+ %cmp = fcmp ogt double %a, %b
+ %val = select i1 %cmp, double %a, double %b
+ store double %val, double addrspace(1)* %out, align 8
+ ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/fmax_legacy.ll b/test/CodeGen/AMDGPU/fmax_legacy.ll
new file mode 100644
index 000000000000..413957d2982a
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fmax_legacy.ll
@@ -0,0 +1,116 @@
+; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=SI-SAFE -check-prefix=FUNC %s
+; RUN: llc -enable-no-nans-fp-math -enable-unsafe-fp-math -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI-NONAN -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+; FIXME: Should replace unsafe-fp-math with no signed zeros.
+
+declare i32 @llvm.r600.read.tidig.x() #1
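+
+; In the default (safe) run the fcmp/select idiom must be matched as
+; v_max_legacy_f32, while the no-NaN/unsafe run (SI-NONAN) can use the IEEE
+; v_max_f32 instead; as the FIXME above notes, no-signed-zeros is presumably
+; the property that should really gate this.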
+
+; FUNC-LABEL: @test_fmax_legacy_uge_f32
+; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+
+; EG: MAX
+define void @test_fmax_legacy_uge_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+
+ %a = load float, float addrspace(1)* %gep.0, align 4
+ %b = load float, float addrspace(1)* %gep.1, align 4
+
+ %cmp = fcmp uge float %a, %b
+ %val = select i1 %cmp, float %a, float %b
+ store float %val, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @test_fmax_legacy_oge_f32
+; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
+; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+; EG: MAX
+define void @test_fmax_legacy_oge_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+
+ %a = load float, float addrspace(1)* %gep.0, align 4
+ %b = load float, float addrspace(1)* %gep.1, align 4
+
+ %cmp = fcmp oge float %a, %b
+ %val = select i1 %cmp, float %a, float %b
+ store float %val, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @test_fmax_legacy_ugt_f32
+; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+; EG: MAX
+define void @test_fmax_legacy_ugt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+
+ %a = load float, float addrspace(1)* %gep.0, align 4
+ %b = load float, float addrspace(1)* %gep.1, align 4
+
+ %cmp = fcmp ugt float %a, %b
+ %val = select i1 %cmp, float %a, float %b
+ store float %val, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @test_fmax_legacy_ogt_f32
+; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
+; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+; EG: MAX
+define void @test_fmax_legacy_ogt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+
+ %a = load float, float addrspace(1)* %gep.0, align 4
+ %b = load float, float addrspace(1)* %gep.1, align 4
+
+ %cmp = fcmp ogt float %a, %b
+ %val = select i1 %cmp, float %a, float %b
+ store float %val, float addrspace(1)* %out, align 4
+ ret void
+}
+
+
+; FUNC-LABEL: @test_fmax_legacy_ogt_f32_multi_use
+; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; SI-NOT: v_max_
+; SI: v_cmp_gt_f32
+; SI-NEXT: v_cndmask_b32
+; SI-NOT: v_max_
+
+; EG: MAX
+define void @test_fmax_legacy_ogt_f32_multi_use(float addrspace(1)* %out0, i1 addrspace(1)* %out1, float addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+
+ %a = load float, float addrspace(1)* %gep.0, align 4
+ %b = load float, float addrspace(1)* %gep.1, align 4
+
+ %cmp = fcmp ogt float %a, %b
+ %val = select i1 %cmp, float %a, float %b
+ store float %val, float addrspace(1)* %out0, align 4
+  store i1 %cmp, i1 addrspace(1)* %out1
+ ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/fmaxnum.f64.ll b/test/CodeGen/AMDGPU/fmaxnum.f64.ll
new file mode 100644
index 000000000000..de563cec3412
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fmaxnum.f64.ll
@@ -0,0 +1,76 @@
+; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+declare double @llvm.maxnum.f64(double, double) #0
+declare <2 x double> @llvm.maxnum.v2f64(<2 x double>, <2 x double>) #0
+declare <4 x double> @llvm.maxnum.v4f64(<4 x double>, <4 x double>) #0
+declare <8 x double> @llvm.maxnum.v8f64(<8 x double>, <8 x double>) #0
+declare <16 x double> @llvm.maxnum.v16f64(<16 x double>, <16 x double>) #0
+
+; FUNC-LABEL: @test_fmax_f64
+; SI: v_max_f64
+define void @test_fmax_f64(double addrspace(1)* %out, double %a, double %b) nounwind {
+ %val = call double @llvm.maxnum.f64(double %a, double %b) #0
+ store double %val, double addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: @test_fmax_v2f64
+; SI: v_max_f64
+; SI: v_max_f64
+define void @test_fmax_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind {
+ %val = call <2 x double> @llvm.maxnum.v2f64(<2 x double> %a, <2 x double> %b) #0
+ store <2 x double> %val, <2 x double> addrspace(1)* %out, align 16
+ ret void
+}
+
+; FUNC-LABEL: @test_fmax_v4f64
+; SI: v_max_f64
+; SI: v_max_f64
+; SI: v_max_f64
+; SI: v_max_f64
+define void @test_fmax_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind {
+ %val = call <4 x double> @llvm.maxnum.v4f64(<4 x double> %a, <4 x double> %b) #0
+ store <4 x double> %val, <4 x double> addrspace(1)* %out, align 32
+ ret void
+}
+
+; FUNC-LABEL: @test_fmax_v8f64
+; SI: v_max_f64
+; SI: v_max_f64
+; SI: v_max_f64
+; SI: v_max_f64
+; SI: v_max_f64
+; SI: v_max_f64
+; SI: v_max_f64
+; SI: v_max_f64
+define void @test_fmax_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind {
+ %val = call <8 x double> @llvm.maxnum.v8f64(<8 x double> %a, <8 x double> %b) #0
+ store <8 x double> %val, <8 x double> addrspace(1)* %out, align 64
+ ret void
+}
+
+; FUNC-LABEL: @test_fmax_v16f64
+; SI: v_max_f64
+; SI: v_max_f64
+; SI: v_max_f64
+; SI: v_max_f64
+; SI: v_max_f64
+; SI: v_max_f64
+; SI: v_max_f64
+; SI: v_max_f64
+; SI: v_max_f64
+; SI: v_max_f64
+; SI: v_max_f64
+; SI: v_max_f64
+; SI: v_max_f64
+; SI: v_max_f64
+; SI: v_max_f64
+; SI: v_max_f64
+define void @test_fmax_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind {
+ %val = call <16 x double> @llvm.maxnum.v16f64(<16 x double> %a, <16 x double> %b) #0
+ store <16 x double> %val, <16 x double> addrspace(1)* %out, align 128
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/fmaxnum.ll b/test/CodeGen/AMDGPU/fmaxnum.ll
new file mode 100644
index 000000000000..3029bd02e4db
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fmaxnum.ll
@@ -0,0 +1,283 @@
+; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+declare float @llvm.maxnum.f32(float, float) #0
+declare <2 x float> @llvm.maxnum.v2f32(<2 x float>, <2 x float>) #0
+declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>) #0
+declare <8 x float> @llvm.maxnum.v8f32(<8 x float>, <8 x float>) #0
+declare <16 x float> @llvm.maxnum.v16f32(<16 x float>, <16 x float>) #0
+
+declare double @llvm.maxnum.f64(double, double)
+
+; FUNC-LABEL: @test_fmax_f32
+; SI: v_max_f32_e32
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG: MAX_DX10 {{.*}}[[OUT]]
+define void @test_fmax_f32(float addrspace(1)* %out, float %a, float %b) nounwind {
+ %val = call float @llvm.maxnum.f32(float %a, float %b) #0
+ store float %val, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @test_fmax_v2f32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+]]
+; EG: MAX_DX10 {{.*}}[[OUT]]
+; EG: MAX_DX10 {{.*}}[[OUT]]
+define void @test_fmax_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) nounwind {
+ %val = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %a, <2 x float> %b) #0
+ store <2 x float> %val, <2 x float> addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: @test_fmax_v4f32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+]]
+; EG: MAX_DX10 {{.*}}[[OUT]]
+; EG: MAX_DX10 {{.*}}[[OUT]]
+; EG: MAX_DX10 {{.*}}[[OUT]]
+; EG: MAX_DX10 {{.*}}[[OUT]]
+define void @test_fmax_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) nounwind {
+ %val = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %a, <4 x float> %b) #0
+ store <4 x float> %val, <4 x float> addrspace(1)* %out, align 16
+ ret void
+}
+
+; FUNC-LABEL: @test_fmax_v8f32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT1:T[0-9]+]]
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT2:T[0-9]+]]
+; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].X
+; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].Y
+; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].Z
+; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].W
+; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].X
+; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].Y
+; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].Z
+; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].W
+define void @test_fmax_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) nounwind {
+ %val = call <8 x float> @llvm.maxnum.v8f32(<8 x float> %a, <8 x float> %b) #0
+ store <8 x float> %val, <8 x float> addrspace(1)* %out, align 32
+ ret void
+}
+
+; FUNC-LABEL: @test_fmax_v16f32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT1:T[0-9]+]]
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT2:T[0-9]+]]
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT3:T[0-9]+]]
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT4:T[0-9]+]]
+; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].X
+; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].Y
+; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].Z
+; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].W
+; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].X
+; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].Y
+; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].Z
+; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].W
+; EG-DAG: MAX_DX10 {{.*}}[[OUT3]].X
+; EG-DAG: MAX_DX10 {{.*}}[[OUT3]].Y
+; EG-DAG: MAX_DX10 {{.*}}[[OUT3]].Z
+; EG-DAG: MAX_DX10 {{.*}}[[OUT3]].W
+; EG-DAG: MAX_DX10 {{.*}}[[OUT4]].X
+; EG-DAG: MAX_DX10 {{.*}}[[OUT4]].Y
+; EG-DAG: MAX_DX10 {{.*}}[[OUT4]].Z
+; EG-DAG: MAX_DX10 {{.*}}[[OUT4]].W
+define void @test_fmax_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) nounwind {
+ %val = call <16 x float> @llvm.maxnum.v16f32(<16 x float> %a, <16 x float> %b) #0
+ store <16 x float> %val, <16 x float> addrspace(1)* %out, align 64
+ ret void
+}
+
+; FUNC-LABEL: @constant_fold_fmax_f32
+; SI-NOT: v_max_f32_e32
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 2.0
+; SI: buffer_store_dword [[REG]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG-NOT: MAX_DX10
+; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
+define void @constant_fold_fmax_f32(float addrspace(1)* %out) nounwind {
+ %val = call float @llvm.maxnum.f32(float 1.0, float 2.0) #0
+ store float %val, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @constant_fold_fmax_f32_nan_nan
+; SI-NOT: v_max_f32_e32
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000
+; SI: buffer_store_dword [[REG]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG-NOT: MAX_DX10
+; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
+; EG: 2143289344(nan)
+define void @constant_fold_fmax_f32_nan_nan(float addrspace(1)* %out) nounwind {
+ %val = call float @llvm.maxnum.f32(float 0x7FF8000000000000, float 0x7FF8000000000000) #0
+ store float %val, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @constant_fold_fmax_f32_val_nan
+; SI-NOT: v_max_f32_e32
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0
+; SI: buffer_store_dword [[REG]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG-NOT: MAX_DX10
+; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
+define void @constant_fold_fmax_f32_val_nan(float addrspace(1)* %out) nounwind {
+ %val = call float @llvm.maxnum.f32(float 1.0, float 0x7FF8000000000000) #0
+ store float %val, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @constant_fold_fmax_f32_nan_val
+; SI-NOT: v_max_f32_e32
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0
+; SI: buffer_store_dword [[REG]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG-NOT: MAX_DX10
+; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
+define void @constant_fold_fmax_f32_nan_val(float addrspace(1)* %out) nounwind {
+ %val = call float @llvm.maxnum.f32(float 0x7FF8000000000000, float 1.0) #0
+ store float %val, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @constant_fold_fmax_f32_p0_p0
+; SI-NOT: v_max_f32_e32
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0
+; SI: buffer_store_dword [[REG]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG-NOT: MAX_DX10
+; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
+define void @constant_fold_fmax_f32_p0_p0(float addrspace(1)* %out) nounwind {
+ %val = call float @llvm.maxnum.f32(float 0.0, float 0.0) #0
+ store float %val, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @constant_fold_fmax_f32_p0_n0
+; SI-NOT: v_max_f32_e32
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0
+; SI: buffer_store_dword [[REG]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG-NOT: MAX_DX10
+; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
+define void @constant_fold_fmax_f32_p0_n0(float addrspace(1)* %out) nounwind {
+ %val = call float @llvm.maxnum.f32(float 0.0, float -0.0) #0
+ store float %val, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @constant_fold_fmax_f32_n0_p0
+; SI-NOT: v_max_f32_e32
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80000000
+; SI: buffer_store_dword [[REG]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG-NOT: MAX_DX10
+; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
+define void @constant_fold_fmax_f32_n0_p0(float addrspace(1)* %out) nounwind {
+ %val = call float @llvm.maxnum.f32(float -0.0, float 0.0) #0
+ store float %val, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @constant_fold_fmax_f32_n0_n0
+; SI-NOT: v_max_f32_e32
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80000000
+; SI: buffer_store_dword [[REG]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG-NOT: MAX_DX10
+; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
+define void @constant_fold_fmax_f32_n0_n0(float addrspace(1)* %out) nounwind {
+ %val = call float @llvm.maxnum.f32(float -0.0, float -0.0) #0
+ store float %val, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @fmax_var_immediate_f32
+; SI: v_max_f32_e64 {{v[0-9]+}}, 2.0, {{s[0-9]+}}
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG-NOT: MAX_DX10
+; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
+define void @fmax_var_immediate_f32(float addrspace(1)* %out, float %a) nounwind {
+ %val = call float @llvm.maxnum.f32(float %a, float 2.0) #0
+ store float %val, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @fmax_immediate_var_f32
+; SI: v_max_f32_e64 {{v[0-9]+}}, 2.0, {{s[0-9]+}}
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG: MAX_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}}
+define void @fmax_immediate_var_f32(float addrspace(1)* %out, float %a) nounwind {
+ %val = call float @llvm.maxnum.f32(float 2.0, float %a) #0
+ store float %val, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @fmax_var_literal_f32
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x42c60000
+; SI: v_max_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[REG]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG: MAX_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}}
+define void @fmax_var_literal_f32(float addrspace(1)* %out, float %a) nounwind {
+ %val = call float @llvm.maxnum.f32(float %a, float 99.0) #0
+ store float %val, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @fmax_literal_var_f32
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x42c60000
+; SI: v_max_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[REG]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG: MAX_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}}
+define void @fmax_literal_var_f32(float addrspace(1)* %out, float %a) nounwind {
+ %val = call float @llvm.maxnum.f32(float 99.0, float %a) #0
+ store float %val, float addrspace(1)* %out, align 4
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/fmin.ll b/test/CodeGen/AMDGPU/fmin.ll
new file mode 100644
index 000000000000..defa8c09638a
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fmin.ll
@@ -0,0 +1,17 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+;CHECK: MIN * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @test(<4 x float> inreg %reg0) #0 {
+ %r0 = extractelement <4 x float> %reg0, i32 0
+ %r1 = extractelement <4 x float> %reg0, i32 1
+ %r2 = fcmp uge float %r0, %r1
+ %r3 = select i1 %r2, float %r1, float %r0
+ %vec = insertelement <4 x float> undef, float %r3, i32 0
+ call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
+ ret void
+}
+
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+
+attributes #0 = { "ShaderType"="0" }
\ No newline at end of file
diff --git a/test/CodeGen/AMDGPU/fmin3.ll b/test/CodeGen/AMDGPU/fmin3.ll
new file mode 100644
index 000000000000..0a76699b43e1
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fmin3.ll
@@ -0,0 +1,40 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+declare float @llvm.minnum.f32(float, float) nounwind readnone
+
+; SI-LABEL: {{^}}test_fmin3_olt_0:
+; SI: buffer_load_dword [[REGC:v[0-9]+]]
+; SI: buffer_load_dword [[REGB:v[0-9]+]]
+; SI: buffer_load_dword [[REGA:v[0-9]+]]
+; SI: v_min3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
+; SI: buffer_store_dword [[RESULT]],
+; SI: s_endpgm
+define void @test_fmin3_olt_0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
+ %a = load float, float addrspace(1)* %aptr, align 4
+ %b = load float, float addrspace(1)* %bptr, align 4
+ %c = load float, float addrspace(1)* %cptr, align 4
+ %f0 = call float @llvm.minnum.f32(float %a, float %b) nounwind readnone
+ %f1 = call float @llvm.minnum.f32(float %f0, float %c) nounwind readnone
+ store float %f1, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; Commute operand of second fmin
+; SI-LABEL: {{^}}test_fmin3_olt_1:
+; SI: buffer_load_dword [[REGB:v[0-9]+]]
+; SI: buffer_load_dword [[REGA:v[0-9]+]]
+; SI: buffer_load_dword [[REGC:v[0-9]+]]
+; SI: v_min3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
+; SI: buffer_store_dword [[RESULT]],
+; SI: s_endpgm
+define void @test_fmin3_olt_1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
+ %a = load float, float addrspace(1)* %aptr, align 4
+ %b = load float, float addrspace(1)* %bptr, align 4
+ %c = load float, float addrspace(1)* %cptr, align 4
+ %f0 = call float @llvm.minnum.f32(float %a, float %b) nounwind readnone
+ %f1 = call float @llvm.minnum.f32(float %c, float %f0) nounwind readnone
+ store float %f1, float addrspace(1)* %out, align 4
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/fmin_legacy.f64.ll b/test/CodeGen/AMDGPU/fmin_legacy.f64.ll
new file mode 100644
index 000000000000..e19a48f3f7e2
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fmin_legacy.f64.ll
@@ -0,0 +1,77 @@
+; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+declare i32 @llvm.r600.read.tidig.x() #1
+
+; FUNC-LABEL: @test_fmin_legacy_f64
+define void @test_fmin_legacy_f64(<4 x double> addrspace(1)* %out, <4 x double> inreg %reg0) #0 {
+ %r0 = extractelement <4 x double> %reg0, i32 0
+ %r1 = extractelement <4 x double> %reg0, i32 1
+ %r2 = fcmp uge double %r0, %r1
+ %r3 = select i1 %r2, double %r1, double %r0
+ %vec = insertelement <4 x double> undef, double %r3, i32 0
+ store <4 x double> %vec, <4 x double> addrspace(1)* %out, align 16
+ ret void
+}
+
+; FUNC-LABEL: @test_fmin_legacy_ule_f64
+define void @test_fmin_legacy_ule_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
+
+ %a = load double, double addrspace(1)* %gep.0, align 8
+ %b = load double, double addrspace(1)* %gep.1, align 8
+
+ %cmp = fcmp ule double %a, %b
+ %val = select i1 %cmp, double %a, double %b
+ store double %val, double addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: @test_fmin_legacy_ole_f64
+define void @test_fmin_legacy_ole_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
+
+ %a = load double, double addrspace(1)* %gep.0, align 8
+ %b = load double, double addrspace(1)* %gep.1, align 8
+
+ %cmp = fcmp ole double %a, %b
+ %val = select i1 %cmp, double %a, double %b
+ store double %val, double addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: @test_fmin_legacy_olt_f64
+define void @test_fmin_legacy_olt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
+
+ %a = load double, double addrspace(1)* %gep.0, align 8
+ %b = load double, double addrspace(1)* %gep.1, align 8
+
+ %cmp = fcmp olt double %a, %b
+ %val = select i1 %cmp, double %a, double %b
+ store double %val, double addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: @test_fmin_legacy_ult_f64
+define void @test_fmin_legacy_ult_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
+
+ %a = load double, double addrspace(1)* %gep.0, align 8
+ %b = load double, double addrspace(1)* %gep.1, align 8
+
+ %cmp = fcmp ult double %a, %b
+ %val = select i1 %cmp, double %a, double %b
+ store double %val, double addrspace(1)* %out, align 8
+ ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/fmin_legacy.ll b/test/CodeGen/AMDGPU/fmin_legacy.ll
new file mode 100644
index 000000000000..6a625c239d76
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fmin_legacy.ll
@@ -0,0 +1,123 @@
+; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -enable-no-nans-fp-math -enable-unsafe-fp-math -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI-NONAN -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+; FIXME: Should replace unsafe-fp-math with no signed zeros.
+
+declare i32 @llvm.r600.read.tidig.x() #1
+
+; FUNC-LABEL: @test_fmin_legacy_f32
+; EG: MIN *
+; SI-SAFE: v_min_legacy_f32_e32
+; SI-NONAN: v_min_f32_e32
+define void @test_fmin_legacy_f32(<4 x float> addrspace(1)* %out, <4 x float> inreg %reg0) #0 {
+ %r0 = extractelement <4 x float> %reg0, i32 0
+ %r1 = extractelement <4 x float> %reg0, i32 1
+ %r2 = fcmp uge float %r0, %r1
+ %r3 = select i1 %r2, float %r1, float %r0
+ %vec = insertelement <4 x float> undef, float %r3, i32 0
+ store <4 x float> %vec, <4 x float> addrspace(1)* %out, align 16
+ ret void
+}
+
+; FUNC-LABEL: @test_fmin_legacy_ule_f32
+; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+define void @test_fmin_legacy_ule_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+
+ %a = load float, float addrspace(1)* %gep.0, align 4
+ %b = load float, float addrspace(1)* %gep.1, align 4
+
+ %cmp = fcmp ule float %a, %b
+ %val = select i1 %cmp, float %a, float %b
+ store float %val, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @test_fmin_legacy_ole_f32
+; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
+; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+define void @test_fmin_legacy_ole_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+
+ %a = load float, float addrspace(1)* %gep.0, align 4
+ %b = load float, float addrspace(1)* %gep.1, align 4
+
+ %cmp = fcmp ole float %a, %b
+ %val = select i1 %cmp, float %a, float %b
+ store float %val, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @test_fmin_legacy_olt_f32
+; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
+; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+define void @test_fmin_legacy_olt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+
+ %a = load float, float addrspace(1)* %gep.0, align 4
+ %b = load float, float addrspace(1)* %gep.1, align 4
+
+ %cmp = fcmp olt float %a, %b
+ %val = select i1 %cmp, float %a, float %b
+ store float %val, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @test_fmin_legacy_ult_f32
+; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+define void @test_fmin_legacy_ult_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+
+ %a = load float, float addrspace(1)* %gep.0, align 4
+ %b = load float, float addrspace(1)* %gep.1, align 4
+
+ %cmp = fcmp ult float %a, %b
+ %val = select i1 %cmp, float %a, float %b
+ store float %val, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @test_fmin_legacy_ole_f32_multi_use
+; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; SI-NOT: v_min
+; SI: v_cmp_le_f32
+; SI-NEXT: v_cndmask_b32
+; SI-NOT: v_min
+; SI: s_endpgm
+define void @test_fmin_legacy_ole_f32_multi_use(float addrspace(1)* %out0, i1 addrspace(1)* %out1, float addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+
+ %a = load float, float addrspace(1)* %gep.0, align 4
+ %b = load float, float addrspace(1)* %gep.1, align 4
+
+ %cmp = fcmp ole float %a, %b
+ %val0 = select i1 %cmp, float %a, float %b
+ store float %val0, float addrspace(1)* %out0, align 4
+ store i1 %cmp, i1 addrspace(1)* %out1
+ ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/fminnum.f64.ll b/test/CodeGen/AMDGPU/fminnum.f64.ll
new file mode 100644
index 000000000000..0f929d6a81f0
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fminnum.f64.ll
@@ -0,0 +1,76 @@
+; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+declare double @llvm.minnum.f64(double, double) #0
+declare <2 x double> @llvm.minnum.v2f64(<2 x double>, <2 x double>) #0
+declare <4 x double> @llvm.minnum.v4f64(<4 x double>, <4 x double>) #0
+declare <8 x double> @llvm.minnum.v8f64(<8 x double>, <8 x double>) #0
+declare <16 x double> @llvm.minnum.v16f64(<16 x double>, <16 x double>) #0
+
+; FUNC-LABEL: @test_fmin_f64
+; SI: v_min_f64
+define void @test_fmin_f64(double addrspace(1)* %out, double %a, double %b) nounwind {
+ %val = call double @llvm.minnum.f64(double %a, double %b) #0
+ store double %val, double addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: @test_fmin_v2f64
+; SI: v_min_f64
+; SI: v_min_f64
+define void @test_fmin_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind {
+ %val = call <2 x double> @llvm.minnum.v2f64(<2 x double> %a, <2 x double> %b) #0
+ store <2 x double> %val, <2 x double> addrspace(1)* %out, align 16
+ ret void
+}
+
+; FUNC-LABEL: @test_fmin_v4f64
+; SI: v_min_f64
+; SI: v_min_f64
+; SI: v_min_f64
+; SI: v_min_f64
+define void @test_fmin_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind {
+ %val = call <4 x double> @llvm.minnum.v4f64(<4 x double> %a, <4 x double> %b) #0
+ store <4 x double> %val, <4 x double> addrspace(1)* %out, align 32
+ ret void
+}
+
+; FUNC-LABEL: @test_fmin_v8f64
+; SI: v_min_f64
+; SI: v_min_f64
+; SI: v_min_f64
+; SI: v_min_f64
+; SI: v_min_f64
+; SI: v_min_f64
+; SI: v_min_f64
+; SI: v_min_f64
+define void @test_fmin_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind {
+ %val = call <8 x double> @llvm.minnum.v8f64(<8 x double> %a, <8 x double> %b) #0
+ store <8 x double> %val, <8 x double> addrspace(1)* %out, align 64
+ ret void
+}
+
+; FUNC-LABEL: @test_fmin_v16f64
+; SI: v_min_f64
+; SI: v_min_f64
+; SI: v_min_f64
+; SI: v_min_f64
+; SI: v_min_f64
+; SI: v_min_f64
+; SI: v_min_f64
+; SI: v_min_f64
+; SI: v_min_f64
+; SI: v_min_f64
+; SI: v_min_f64
+; SI: v_min_f64
+; SI: v_min_f64
+; SI: v_min_f64
+; SI: v_min_f64
+; SI: v_min_f64
+define void @test_fmin_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind {
+ %val = call <16 x double> @llvm.minnum.v16f64(<16 x double> %a, <16 x double> %b) #0
+ store <16 x double> %val, <16 x double> addrspace(1)* %out, align 128
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/fminnum.ll b/test/CodeGen/AMDGPU/fminnum.ll
new file mode 100644
index 000000000000..4d7b52540d85
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fminnum.ll
@@ -0,0 +1,281 @@
+; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+declare float @llvm.minnum.f32(float, float) #0
+declare <2 x float> @llvm.minnum.v2f32(<2 x float>, <2 x float>) #0
+declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>) #0
+declare <8 x float> @llvm.minnum.v8f32(<8 x float>, <8 x float>) #0
+declare <16 x float> @llvm.minnum.v16f32(<16 x float>, <16 x float>) #0
+
+; FUNC-LABEL: @test_fmin_f32
+; SI: v_min_f32_e32
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG: MIN_DX10 {{.*}}[[OUT]]
+define void @test_fmin_f32(float addrspace(1)* %out, float %a, float %b) nounwind {
+ %val = call float @llvm.minnum.f32(float %a, float %b) #0
+ store float %val, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @test_fmin_v2f32
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+]]
+; EG: MIN_DX10 {{.*}}[[OUT]]
+; EG: MIN_DX10 {{.*}}[[OUT]]
+define void @test_fmin_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) nounwind {
+ %val = call <2 x float> @llvm.minnum.v2f32(<2 x float> %a, <2 x float> %b) #0
+ store <2 x float> %val, <2 x float> addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: @test_fmin_v4f32
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+]]
+; EG: MIN_DX10 {{.*}}[[OUT]]
+; EG: MIN_DX10 {{.*}}[[OUT]]
+; EG: MIN_DX10 {{.*}}[[OUT]]
+; EG: MIN_DX10 {{.*}}[[OUT]]
+define void @test_fmin_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) nounwind {
+ %val = call <4 x float> @llvm.minnum.v4f32(<4 x float> %a, <4 x float> %b) #0
+ store <4 x float> %val, <4 x float> addrspace(1)* %out, align 16
+ ret void
+}
+
+; FUNC-LABEL: @test_fmin_v8f32
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT1:T[0-9]+]]
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT2:T[0-9]+]]
+; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].X
+; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].Y
+; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].Z
+; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].W
+; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].X
+; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].Y
+; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].Z
+; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].W
+define void @test_fmin_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) nounwind {
+ %val = call <8 x float> @llvm.minnum.v8f32(<8 x float> %a, <8 x float> %b) #0
+ store <8 x float> %val, <8 x float> addrspace(1)* %out, align 32
+ ret void
+}
+
+; FUNC-LABEL: @test_fmin_v16f32
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT1:T[0-9]+]]
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT2:T[0-9]+]]
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT3:T[0-9]+]]
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT4:T[0-9]+]]
+; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].X
+; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].Y
+; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].Z
+; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].W
+; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].X
+; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].Y
+; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].Z
+; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].W
+; EG-DAG: MIN_DX10 {{.*}}[[OUT3]].X
+; EG-DAG: MIN_DX10 {{.*}}[[OUT3]].Y
+; EG-DAG: MIN_DX10 {{.*}}[[OUT3]].Z
+; EG-DAG: MIN_DX10 {{.*}}[[OUT3]].W
+; EG-DAG: MIN_DX10 {{.*}}[[OUT4]].X
+; EG-DAG: MIN_DX10 {{.*}}[[OUT4]].Y
+; EG-DAG: MIN_DX10 {{.*}}[[OUT4]].Z
+; EG-DAG: MIN_DX10 {{.*}}[[OUT4]].W
+define void @test_fmin_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) nounwind {
+ %val = call <16 x float> @llvm.minnum.v16f32(<16 x float> %a, <16 x float> %b) #0
+ store <16 x float> %val, <16 x float> addrspace(1)* %out, align 64
+ ret void
+}
+
+; FUNC-LABEL: @constant_fold_fmin_f32
+; SI-NOT: v_min_f32_e32
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0
+; SI: buffer_store_dword [[REG]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG-NOT: MIN_DX10
+; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
+define void @constant_fold_fmin_f32(float addrspace(1)* %out) nounwind {
+ %val = call float @llvm.minnum.f32(float 1.0, float 2.0) #0
+ store float %val, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @constant_fold_fmin_f32_nan_nan
+; SI-NOT: v_min_f32_e32
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000
+; SI: buffer_store_dword [[REG]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG-NOT: MIN_DX10
+; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
+; EG: 2143289344({{nan|1\.#QNAN0e\+00}})
+define void @constant_fold_fmin_f32_nan_nan(float addrspace(1)* %out) nounwind {
+ %val = call float @llvm.minnum.f32(float 0x7FF8000000000000, float 0x7FF8000000000000) #0
+ store float %val, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @constant_fold_fmin_f32_val_nan
+; SI-NOT: v_min_f32_e32
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0
+; SI: buffer_store_dword [[REG]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG-NOT: MIN_DX10
+; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
+define void @constant_fold_fmin_f32_val_nan(float addrspace(1)* %out) nounwind {
+ %val = call float @llvm.minnum.f32(float 1.0, float 0x7FF8000000000000) #0
+ store float %val, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @constant_fold_fmin_f32_nan_val
+; SI-NOT: v_min_f32_e32
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0
+; SI: buffer_store_dword [[REG]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG-NOT: MIN_DX10
+; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
+define void @constant_fold_fmin_f32_nan_val(float addrspace(1)* %out) nounwind {
+ %val = call float @llvm.minnum.f32(float 0x7FF8000000000000, float 1.0) #0
+ store float %val, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @constant_fold_fmin_f32_p0_p0
+; SI-NOT: v_min_f32_e32
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0
+; SI: buffer_store_dword [[REG]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG-NOT: MIN_DX10
+; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
+define void @constant_fold_fmin_f32_p0_p0(float addrspace(1)* %out) nounwind {
+ %val = call float @llvm.minnum.f32(float 0.0, float 0.0) #0
+ store float %val, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @constant_fold_fmin_f32_p0_n0
+; SI-NOT: v_min_f32_e32
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0
+; SI: buffer_store_dword [[REG]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG-NOT: MIN_DX10
+; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
+define void @constant_fold_fmin_f32_p0_n0(float addrspace(1)* %out) nounwind {
+ %val = call float @llvm.minnum.f32(float 0.0, float -0.0) #0
+ store float %val, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @constant_fold_fmin_f32_n0_p0
+; SI-NOT: v_min_f32_e32
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80000000
+; SI: buffer_store_dword [[REG]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG-NOT: MIN_DX10
+; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
+define void @constant_fold_fmin_f32_n0_p0(float addrspace(1)* %out) nounwind {
+ %val = call float @llvm.minnum.f32(float -0.0, float 0.0) #0
+ store float %val, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @constant_fold_fmin_f32_n0_n0
+; SI-NOT: v_min_f32_e32
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80000000
+; SI: buffer_store_dword [[REG]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG-NOT: MIN_DX10
+; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
+define void @constant_fold_fmin_f32_n0_n0(float addrspace(1)* %out) nounwind {
+ %val = call float @llvm.minnum.f32(float -0.0, float -0.0) #0
+ store float %val, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @fmin_var_immediate_f32
+; SI: v_min_f32_e64 {{v[0-9]+}}, 2.0, {{s[0-9]+}}
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG: MIN_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}}
+define void @fmin_var_immediate_f32(float addrspace(1)* %out, float %a) nounwind {
+ %val = call float @llvm.minnum.f32(float %a, float 2.0) #0
+ store float %val, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @fmin_immediate_var_f32
+; SI: v_min_f32_e64 {{v[0-9]+}}, 2.0, {{s[0-9]+}}
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG: MIN_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}}
+define void @fmin_immediate_var_f32(float addrspace(1)* %out, float %a) nounwind {
+ %val = call float @llvm.minnum.f32(float 2.0, float %a) #0
+ store float %val, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @fmin_var_literal_f32
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x42c60000
+; SI: v_min_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[REG]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG: MIN_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}}
+define void @fmin_var_literal_f32(float addrspace(1)* %out, float %a) nounwind {
+ %val = call float @llvm.minnum.f32(float %a, float 99.0) #0
+ store float %val, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @fmin_literal_var_f32
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x42c60000
+; SI: v_min_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[REG]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG: MIN_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}}
+define void @fmin_literal_var_f32(float addrspace(1)* %out, float %a) nounwind {
+ %val = call float @llvm.minnum.f32(float 99.0, float %a) #0
+ store float %val, float addrspace(1)* %out, align 4
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/fmul.ll b/test/CodeGen/AMDGPU/fmul.ll
new file mode 100644
index 000000000000..addc409c9eb1
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fmul.ll
@@ -0,0 +1,92 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
+
+
+; FUNC-LABEL: {{^}}fmul_f32:
+; R600: MUL_IEEE {{\** *}}{{T[0-9]+\.[XYZW]}}, KC0[2].Z, KC0[2].W
+
+; SI: v_mul_f32
+define void @fmul_f32(float addrspace(1)* %out, float %a, float %b) {
+entry:
+ %0 = fmul float %a, %b
+ store float %0, float addrspace(1)* %out
+ ret void
+}
+
+declare float @llvm.R600.load.input(i32) readnone
+
+declare void @llvm.AMDGPU.store.output(float, i32)
+
+; FUNC-LABEL: {{^}}fmul_v2f32:
+; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}
+; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}
+
+; SI: v_mul_f32
+; SI: v_mul_f32
+define void @fmul_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
+entry:
+ %0 = fmul <2 x float> %a, %b
+ store <2 x float> %0, <2 x float> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fmul_v4f32:
+; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+; SI: v_mul_f32
+; SI: v_mul_f32
+; SI: v_mul_f32
+; SI: v_mul_f32
+define void @fmul_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
+ %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
+ %a = load <4 x float>, <4 x float> addrspace(1) * %in
+ %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr
+ %result = fmul <4 x float> %a, %b
+ store <4 x float> %result, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_mul_2_k:
+; SI: v_mul_f32
+; SI-NOT: v_mul_f32
+; SI: s_endpgm
+define void @test_mul_2_k(float addrspace(1)* %out, float %x) #0 {
+ %y = fmul float %x, 2.0
+ %z = fmul float %y, 3.0
+ store float %z, float addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_mul_2_k_inv:
+; SI: v_mul_f32
+; SI-NOT: v_mul_f32
+; SI-NOT: v_mad_f32
+; SI: s_endpgm
+define void @test_mul_2_k_inv(float addrspace(1)* %out, float %x) #0 {
+ %y = fmul float %x, 3.0
+ %z = fmul float %y, 2.0
+ store float %z, float addrspace(1)* %out
+ ret void
+}
+
+; There should be three multiplies here; %a should be used twice (once
+; negated), not duplicated into mul x, 5.0 and mul x, -5.0.
+; FUNC-LABEL: {{^}}test_mul_twouse:
+; SI: v_mul_f32
+; SI: v_mul_f32
+; SI: v_mul_f32
+; SI-NOT: v_mul_f32
+define void @test_mul_twouse(float addrspace(1)* %out, float %x, float %y) #0 {
+ %a = fmul float %x, 5.0
+ %b = fsub float -0.0, %a
+ %c = fmul float %b, %y
+ %d = fmul float %c, %a
+ store float %d, float addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { "less-precise-fpmad"="true" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" }
diff --git a/test/CodeGen/AMDGPU/fmul64.ll b/test/CodeGen/AMDGPU/fmul64.ll
new file mode 100644
index 000000000000..3c222eaba89d
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fmul64.ll
@@ -0,0 +1,39 @@
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s
+
+; FUNC-LABEL: {{^}}fmul_f64:
+; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
+define void @fmul_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+ double addrspace(1)* %in2) {
+ %r0 = load double, double addrspace(1)* %in1
+ %r1 = load double, double addrspace(1)* %in2
+ %r2 = fmul double %r0, %r1
+ store double %r2, double addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fmul_v2f64:
+; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
+; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
+define void @fmul_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1,
+ <2 x double> addrspace(1)* %in2) {
+ %r0 = load <2 x double>, <2 x double> addrspace(1)* %in1
+ %r1 = load <2 x double>, <2 x double> addrspace(1)* %in2
+ %r2 = fmul <2 x double> %r0, %r1
+ store <2 x double> %r2, <2 x double> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fmul_v4f64:
+; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
+; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
+; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
+; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
+define void @fmul_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in1,
+ <4 x double> addrspace(1)* %in2) {
+ %r0 = load <4 x double>, <4 x double> addrspace(1)* %in1
+ %r1 = load <4 x double>, <4 x double> addrspace(1)* %in2
+ %r2 = fmul <4 x double> %r0, %r1
+ store <4 x double> %r2, <4 x double> addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/fmuladd.ll b/test/CodeGen/AMDGPU/fmuladd.ll
new file mode 100644
index 000000000000..ae84d841021d
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fmuladd.ll
@@ -0,0 +1,199 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s
+
+declare float @llvm.fmuladd.f32(float, float, float)
+declare double @llvm.fmuladd.f64(double, double, double)
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+declare float @llvm.fabs.f32(float) nounwind readnone
+
+; CHECK-LABEL: {{^}}fmuladd_f32:
+; CHECK: v_mad_f32 {{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}}
+
+define void @fmuladd_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
+ float addrspace(1)* %in2, float addrspace(1)* %in3) {
+ %r0 = load float, float addrspace(1)* %in1
+ %r1 = load float, float addrspace(1)* %in2
+ %r2 = load float, float addrspace(1)* %in3
+ %r3 = tail call float @llvm.fmuladd.f32(float %r0, float %r1, float %r2)
+ store float %r3, float addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}fmuladd_f64:
+; CHECK: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
+
+define void @fmuladd_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+ double addrspace(1)* %in2, double addrspace(1)* %in3) {
+ %r0 = load double, double addrspace(1)* %in1
+ %r1 = load double, double addrspace(1)* %in2
+ %r2 = load double, double addrspace(1)* %in3
+ %r3 = tail call double @llvm.fmuladd.f64(double %r0, double %r1, double %r2)
+ store double %r3, double addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}fmuladd_2.0_a_b_f32
+; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], [[R2]]
+; CHECK: buffer_store_dword [[RESULT]]
+define void @fmuladd_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
+ %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+ %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
+
+ %r1 = load float, float addrspace(1)* %gep.0
+ %r2 = load float, float addrspace(1)* %gep.1
+
+ %r3 = tail call float @llvm.fmuladd.f32(float 2.0, float %r1, float %r2)
+ store float %r3, float addrspace(1)* %gep.out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}fmuladd_a_2.0_b_f32
+; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], [[R2]]
+; CHECK: buffer_store_dword [[RESULT]]
+define void @fmuladd_a_2.0_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
+ %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+ %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
+
+ %r1 = load float, float addrspace(1)* %gep.0
+ %r2 = load float, float addrspace(1)* %gep.1
+
+ %r3 = tail call float @llvm.fmuladd.f32(float %r1, float 2.0, float %r2)
+ store float %r3, float addrspace(1)* %gep.out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}fadd_a_a_b_f32:
+; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], [[R2]]
+; CHECK: buffer_store_dword [[RESULT]]
+define void @fadd_a_a_b_f32(float addrspace(1)* %out,
+ float addrspace(1)* %in1,
+ float addrspace(1)* %in2) {
+ %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+ %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
+
+ %r0 = load float, float addrspace(1)* %gep.0
+ %r1 = load float, float addrspace(1)* %gep.1
+
+ %add.0 = fadd float %r0, %r0
+ %add.1 = fadd float %add.0, %r1
+ store float %add.1, float addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}fadd_b_a_a_f32:
+; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], [[R2]]
+; CHECK: buffer_store_dword [[RESULT]]
+define void @fadd_b_a_a_f32(float addrspace(1)* %out,
+ float addrspace(1)* %in1,
+ float addrspace(1)* %in2) {
+ %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+ %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
+
+ %r0 = load float, float addrspace(1)* %gep.0
+ %r1 = load float, float addrspace(1)* %gep.1
+
+ %add.0 = fadd float %r0, %r0
+ %add.1 = fadd float %r1, %add.0
+ store float %add.1, float addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}fmuladd_neg_2.0_a_b_f32
+; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], -2.0, [[R1]], [[R2]]
+; CHECK: buffer_store_dword [[RESULT]]
+define void @fmuladd_neg_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
+ %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+ %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
+
+ %r1 = load float, float addrspace(1)* %gep.0
+ %r2 = load float, float addrspace(1)* %gep.1
+
+ %r3 = tail call float @llvm.fmuladd.f32(float -2.0, float %r1, float %r2)
+ store float %r3, float addrspace(1)* %gep.out
+ ret void
+}
+
+
+; CHECK-LABEL: {{^}}fmuladd_neg_2.0_neg_a_b_f32
+; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], [[R2]]
+; CHECK: buffer_store_dword [[RESULT]]
+define void @fmuladd_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
+ %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+ %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
+
+ %r1 = load float, float addrspace(1)* %gep.0
+ %r2 = load float, float addrspace(1)* %gep.1
+
+ %r1.fneg = fsub float -0.000000e+00, %r1
+
+ %r3 = tail call float @llvm.fmuladd.f32(float -2.0, float %r1.fneg, float %r2)
+ store float %r3, float addrspace(1)* %gep.out
+ ret void
+}
+
+
+; CHECK-LABEL: {{^}}fmuladd_2.0_neg_a_b_f32
+; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], -2.0, [[R1]], [[R2]]
+; CHECK: buffer_store_dword [[RESULT]]
+define void @fmuladd_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
+ %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+ %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
+
+ %r1 = load float, float addrspace(1)* %gep.0
+ %r2 = load float, float addrspace(1)* %gep.1
+
+ %r1.fneg = fsub float -0.000000e+00, %r1
+
+ %r3 = tail call float @llvm.fmuladd.f32(float 2.0, float %r1.fneg, float %r2)
+ store float %r3, float addrspace(1)* %gep.out
+ ret void
+}
+
+
+; CHECK-LABEL: {{^}}fmuladd_2.0_a_neg_b_f32
+; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], -[[R2]]
+; CHECK: buffer_store_dword [[RESULT]]
+define void @fmuladd_2.0_a_neg_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
+ %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+ %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
+
+ %r1 = load float, float addrspace(1)* %gep.0
+ %r2 = load float, float addrspace(1)* %gep.1
+
+ %r2.fneg = fsub float -0.000000e+00, %r2
+
+ %r3 = tail call float @llvm.fmuladd.f32(float 2.0, float %r1, float %r2.fneg)
+ store float %r3, float addrspace(1)* %gep.out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/fnearbyint.ll b/test/CodeGen/AMDGPU/fnearbyint.ll
new file mode 100644
index 000000000000..4fa9adaabdae
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fnearbyint.ll
@@ -0,0 +1,58 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s
+
+; This should have exactly the same output as the test for rint,
+; so no need to check anything.
+
+declare float @llvm.nearbyint.f32(float) #0
+declare <2 x float> @llvm.nearbyint.v2f32(<2 x float>) #0
+declare <4 x float> @llvm.nearbyint.v4f32(<4 x float>) #0
+declare double @llvm.nearbyint.f64(double) #0
+declare <2 x double> @llvm.nearbyint.v2f64(<2 x double>) #0
+declare <4 x double> @llvm.nearbyint.v4f64(<4 x double>) #0
+
+
+define void @fnearbyint_f32(float addrspace(1)* %out, float %in) #1 {
+entry:
+ %0 = call float @llvm.nearbyint.f32(float %in)
+ store float %0, float addrspace(1)* %out
+ ret void
+}
+
+define void @fnearbyint_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) #1 {
+entry:
+ %0 = call <2 x float> @llvm.nearbyint.v2f32(<2 x float> %in)
+ store <2 x float> %0, <2 x float> addrspace(1)* %out
+ ret void
+}
+
+define void @fnearbyint_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) #1 {
+entry:
+ %0 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %in)
+ store <4 x float> %0, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+define void @nearbyint_f64(double addrspace(1)* %out, double %in) {
+entry:
+ %0 = call double @llvm.nearbyint.f64(double %in)
+ store double %0, double addrspace(1)* %out
+ ret void
+}
+define void @nearbyint_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) {
+entry:
+ %0 = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %in)
+ store <2 x double> %0, <2 x double> addrspace(1)* %out
+ ret void
+}
+
+define void @nearbyint_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) {
+entry:
+ %0 = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> %in)
+ store <4 x double> %0, <4 x double> addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind readonly }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/fneg-fabs.f64.ll b/test/CodeGen/AMDGPU/fneg-fabs.f64.ll
new file mode 100644
index 000000000000..8830e8273661
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fneg-fabs.f64.ll
@@ -0,0 +1,100 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+; FIXME: Check something here. Currently it seems fabs + fneg aren't folded
+; into 2 modifiers, although theoretically that should work.
+
+; FUNC-LABEL: {{^}}fneg_fabs_fadd_f64:
+; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, -|v{{\[[0-9]+:[0-9]+\]}}|
+define void @fneg_fabs_fadd_f64(double addrspace(1)* %out, double %x, double %y) {
+ %fabs = call double @llvm.fabs.f64(double %x)
+ %fsub = fsub double -0.000000e+00, %fabs
+ %fadd = fadd double %y, %fsub
+ store double %fadd, double addrspace(1)* %out, align 8
+ ret void
+}
+
+define void @v_fneg_fabs_fadd_f64(double addrspace(1)* %out, double addrspace(1)* %xptr, double addrspace(1)* %yptr) {
+ %x = load double, double addrspace(1)* %xptr, align 8
+ %y = load double, double addrspace(1)* %xptr, align 8
+ %fabs = call double @llvm.fabs.f64(double %x)
+ %fsub = fsub double -0.000000e+00, %fabs
+ %fadd = fadd double %y, %fsub
+ store double %fadd, double addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fneg_fabs_fmul_f64:
+; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, -|{{v\[[0-9]+:[0-9]+\]}}|
+define void @fneg_fabs_fmul_f64(double addrspace(1)* %out, double %x, double %y) {
+ %fabs = call double @llvm.fabs.f64(double %x)
+ %fsub = fsub double -0.000000e+00, %fabs
+ %fmul = fmul double %y, %fsub
+ store double %fmul, double addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fneg_fabs_free_f64:
+define void @fneg_fabs_free_f64(double addrspace(1)* %out, i64 %in) {
+ %bc = bitcast i64 %in to double
+ %fabs = call double @llvm.fabs.f64(double %bc)
+ %fsub = fsub double -0.000000e+00, %fabs
+ store double %fsub, double addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fneg_fabs_fn_free_f64:
+; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000
+; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
+define void @fneg_fabs_fn_free_f64(double addrspace(1)* %out, i64 %in) {
+ %bc = bitcast i64 %in to double
+ %fabs = call double @fabs(double %bc)
+ %fsub = fsub double -0.000000e+00, %fabs
+ store double %fsub, double addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fneg_fabs_f64:
+; SI: s_load_dwordx2 s{{\[}}[[LO_X:[0-9]+]]:[[HI_X:[0-9]+]]{{\]}}
+; SI: s_load_dwordx2
+; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000
+; SI-DAG: v_or_b32_e32 v[[HI_V:[0-9]+]], s[[HI_X]], [[IMMREG]]
+; SI-DAG: v_mov_b32_e32 v[[LO_V:[0-9]+]], s[[LO_X]]
+; SI: buffer_store_dwordx2 v{{\[}}[[LO_V]]:[[HI_V]]{{\]}}
+define void @fneg_fabs_f64(double addrspace(1)* %out, double %in) {
+ %fabs = call double @llvm.fabs.f64(double %in)
+ %fsub = fsub double -0.000000e+00, %fabs
+ store double %fsub, double addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fneg_fabs_v2f64:
+; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000
+; SI-NOT: 0x80000000
+; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
+; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
+define void @fneg_fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) {
+ %fabs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %in)
+ %fsub = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %fabs
+ store <2 x double> %fsub, <2 x double> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fneg_fabs_v4f64:
+; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000
+; SI-NOT: 0x80000000
+; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
+; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
+; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
+; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
+define void @fneg_fabs_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) {
+ %fabs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %in)
+ %fsub = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %fabs
+ store <4 x double> %fsub, <4 x double> addrspace(1)* %out
+ ret void
+}
+
+declare double @fabs(double) readnone
+declare double @llvm.fabs.f64(double) readnone
+declare <2 x double> @llvm.fabs.v2f64(<2 x double>) readnone
+declare <4 x double> @llvm.fabs.v4f64(<4 x double>) readnone
diff --git a/test/CodeGen/AMDGPU/fneg-fabs.ll b/test/CodeGen/AMDGPU/fneg-fabs.ll
new file mode 100644
index 000000000000..3b4930d9897d
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fneg-fabs.ll
@@ -0,0 +1,118 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}fneg_fabs_fadd_f32:
+; SI-NOT: and
+; SI: v_sub_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, |{{v[0-9]+}}|
+define void @fneg_fabs_fadd_f32(float addrspace(1)* %out, float %x, float %y) {
+ %fabs = call float @llvm.fabs.f32(float %x)
+ %fsub = fsub float -0.000000e+00, %fabs
+ %fadd = fadd float %y, %fsub
+ store float %fadd, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fneg_fabs_fmul_f32:
+; SI-NOT: and
+; SI: v_mul_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, -|{{v[0-9]+}}|
+; SI-NOT: and
+define void @fneg_fabs_fmul_f32(float addrspace(1)* %out, float %x, float %y) {
+ %fabs = call float @llvm.fabs.f32(float %x)
+ %fsub = fsub float -0.000000e+00, %fabs
+ %fmul = fmul float %y, %fsub
+ store float %fmul, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; DAGCombiner will transform:
+; (fabs (f32 bitcast (i32 a))) => (f32 bitcast (and (i32 a), 0x7FFFFFFF))
+; unless isFAbsFree returns true
+
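+; A minimal sketch of that fold on the IR (illustrative only, not part of the
+; checked output):
+;   %f    = bitcast i32 %a to float
+;   %fabs = call float @llvm.fabs.f32(float %f)
+; becomes
+;   %abs  = and i32 %a, 2147483647      ; 0x7FFFFFFF clears the sign bit
+;   %fabs = bitcast i32 %abs to float
+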
+; FUNC-LABEL: {{^}}fneg_fabs_free_f32:
+; R600-NOT: AND
+; R600: |PV.{{[XYZW]}}|
+; R600: -PV
+
+; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000
+; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
+define void @fneg_fabs_free_f32(float addrspace(1)* %out, i32 %in) {
+ %bc = bitcast i32 %in to float
+ %fabs = call float @llvm.fabs.f32(float %bc)
+ %fsub = fsub float -0.000000e+00, %fabs
+ store float %fsub, float addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fneg_fabs_fn_free_f32:
+; R600-NOT: AND
+; R600: |PV.{{[XYZW]}}|
+; R600: -PV
+
+; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000
+; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
+define void @fneg_fabs_fn_free_f32(float addrspace(1)* %out, i32 %in) {
+ %bc = bitcast i32 %in to float
+ %fabs = call float @fabs(float %bc)
+ %fsub = fsub float -0.000000e+00, %fabs
+ store float %fsub, float addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fneg_fabs_f32:
+; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000
+; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
+define void @fneg_fabs_f32(float addrspace(1)* %out, float %in) {
+ %fabs = call float @llvm.fabs.f32(float %in)
+ %fsub = fsub float -0.000000e+00, %fabs
+ store float %fsub, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_fneg_fabs_f32:
+; SI: v_or_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}}
+define void @v_fneg_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
+ %val = load float, float addrspace(1)* %in, align 4
+ %fabs = call float @llvm.fabs.f32(float %val)
+ %fsub = fsub float -0.000000e+00, %fabs
+ store float %fsub, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fneg_fabs_v2f32:
+; R600: |{{(PV|T[0-9])\.[XYZW]}}|
+; R600: -PV
+; R600: |{{(PV|T[0-9])\.[XYZW]}}|
+; R600: -PV
+
+; FIXME: SGPR should be used directly for first src operand.
+; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000
+; SI-NOT: 0x80000000
+; SI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[IMMREG]]
+; SI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[IMMREG]]
+define void @fneg_fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
+ %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in)
+ %fsub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %fabs
+ store <2 x float> %fsub, <2 x float> addrspace(1)* %out
+ ret void
+}
+
+; FIXME: SGPR should be used directly for first src operand.
+; FUNC-LABEL: {{^}}fneg_fabs_v4f32:
+; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000
+; SI-NOT: 0x80000000
+; SI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[IMMREG]]
+; SI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[IMMREG]]
+; SI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[IMMREG]]
+; SI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[IMMREG]]
+define void @fneg_fabs_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
+ %fabs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in)
+ %fsub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %fabs
+ store <4 x float> %fsub, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+declare float @fabs(float) readnone
+declare float @llvm.fabs.f32(float) readnone
+declare <2 x float> @llvm.fabs.v2f32(<2 x float>) readnone
+declare <4 x float> @llvm.fabs.v4f32(<4 x float>) readnone
diff --git a/test/CodeGen/AMDGPU/fneg.f64.ll b/test/CodeGen/AMDGPU/fneg.f64.ll
new file mode 100644
index 000000000000..aa6df209035b
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fneg.f64.ll
@@ -0,0 +1,60 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}fneg_f64:
+; GCN: v_xor_b32
+define void @fneg_f64(double addrspace(1)* %out, double %in) {
+ %fneg = fsub double -0.000000e+00, %in
+ store double %fneg, double addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fneg_v2f64:
+; GCN: v_xor_b32
+; GCN: v_xor_b32
+define void @fneg_v2f64(<2 x double> addrspace(1)* nocapture %out, <2 x double> %in) {
+ %fneg = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %in
+ store <2 x double> %fneg, <2 x double> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fneg_v4f64:
+; R600: -PV
+; R600: -T
+; R600: -PV
+; R600: -PV
+
+; GCN: v_xor_b32
+; GCN: v_xor_b32
+; GCN: v_xor_b32
+; GCN: v_xor_b32
+define void @fneg_v4f64(<4 x double> addrspace(1)* nocapture %out, <4 x double> %in) {
+ %fneg = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %in
+ store <4 x double> %fneg, <4 x double> addrspace(1)* %out
+ ret void
+}
+
+; DAGCombiner will transform:
+; (fneg (f64 bitcast (i64 a))) => (f64 bitcast (xor (i64 a), 0x8000000000000000))
+; unless the target returns true for isFNegFree()
+
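+; Sketched on the IR (illustrative only, not part of the checked output):
+;   %d   = bitcast i64 %a to double
+;   %neg = fsub double -0.0, %d
+; becomes
+;   %bit = xor i64 %a, -9223372036854775808   ; 0x8000000000000000, the f64 sign bit
+;   %neg = bitcast i64 %bit to double
+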
+; FUNC-LABEL: {{^}}fneg_free_f64:
+; GCN: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, 0, -{{s\[[0-9]+:[0-9]+\]$}}
+define void @fneg_free_f64(double addrspace(1)* %out, i64 %in) {
+ %bc = bitcast i64 %in to double
+ %fsub = fsub double 0.0, %bc
+ store double %fsub, double addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}fneg_fold_f64:
+; SI: s_load_dwordx2 [[NEG_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; VI: s_load_dwordx2 [[NEG_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
+; GCN-NOT: xor
+; GCN: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, -[[NEG_VALUE]], [[NEG_VALUE]]
+define void @fneg_fold_f64(double addrspace(1)* %out, double %in) {
+ %fsub = fsub double -0.0, %in
+ %fmul = fmul double %fsub, %in
+ store double %fmul, double addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/fneg.ll b/test/CodeGen/AMDGPU/fneg.ll
new file mode 100644
index 000000000000..a0fd539863c6
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fneg.ll
@@ -0,0 +1,70 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}fneg_f32:
+; R600: -PV
+
+; GCN: v_xor_b32
+define void @fneg_f32(float addrspace(1)* %out, float %in) {
+ %fneg = fsub float -0.000000e+00, %in
+ store float %fneg, float addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fneg_v2f32:
+; R600: -PV
+; R600: -PV
+
+; GCN: v_xor_b32
+; GCN: v_xor_b32
+define void @fneg_v2f32(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) {
+ %fneg = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %in
+ store <2 x float> %fneg, <2 x float> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fneg_v4f32:
+; R600: -PV
+; R600: -T
+; R600: -PV
+; R600: -PV
+
+; GCN: v_xor_b32
+; GCN: v_xor_b32
+; GCN: v_xor_b32
+; GCN: v_xor_b32
+define void @fneg_v4f32(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) {
+ %fneg = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %in
+ store <4 x float> %fneg, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+; DAGCombiner will transform:
+; (fneg (f32 bitcast (i32 a))) => (f32 bitcast (xor (i32 a), 0x80000000))
+; unless the target returns true for isFNegFree()
+
+; FUNC-LABEL: {{^}}fneg_free_f32:
+; R600-NOT: XOR
+; R600: -KC0[2].Z
+
+; XXX: We could use v_add_f32_e64 with the negate bit here instead.
+; GCN: v_sub_f32_e64 v{{[0-9]}}, 0, s{{[0-9]+$}}
+define void @fneg_free_f32(float addrspace(1)* %out, i32 %in) {
+ %bc = bitcast i32 %in to float
+ %fsub = fsub float 0.0, %bc
+ store float %fsub, float addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fneg_fold_f32:
+; SI: s_load_dword [[NEG_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb
+; VI: s_load_dword [[NEG_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
+; GCN-NOT: xor
+; GCN: v_mul_f32_e64 v{{[0-9]+}}, -[[NEG_VALUE]], [[NEG_VALUE]]
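+; The fneg is folded into the multiply as a source-operand modifier rather than
+; being materialized as a separate xor.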
+define void @fneg_fold_f32(float addrspace(1)* %out, float %in) {
+ %fsub = fsub float -0.0, %in
+ %fmul = fmul float %fsub, %in
+ store float %fmul, float addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/fp-classify.ll b/test/CodeGen/AMDGPU/fp-classify.ll
new file mode 100644
index 000000000000..4fac5176fac9
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fp-classify.ll
@@ -0,0 +1,131 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+declare i1 @llvm.AMDGPU.class.f32(float, i32) #1
+declare i1 @llvm.AMDGPU.class.f64(double, i32) #1
+declare i32 @llvm.r600.read.tidig.x() #1
+declare float @llvm.fabs.f32(float) #1
+declare double @llvm.fabs.f64(double) #1
+
+; SI-LABEL: {{^}}test_isinf_pattern:
+; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x204{{$}}
+; SI: v_cmp_class_f32_e32 vcc, s{{[0-9]+}}, [[MASK]]
+; SI-NOT: v_cmp
+; SI: s_endpgm
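+; 0x204 sets the v_cmp_class bits for -infinity (0x4) and +infinity (0x200), so
+; the fabs(x) == +inf test collapses into a single class compare.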
+define void @test_isinf_pattern(i32 addrspace(1)* nocapture %out, float %x) #0 {
+ %fabs = tail call float @llvm.fabs.f32(float %x) #1
+ %cmp = fcmp oeq float %fabs, 0x7FF0000000000000
+ %ext = zext i1 %cmp to i32
+ store i32 %ext, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}test_not_isinf_pattern_0:
+; SI-NOT: v_cmp_class
+; SI: s_endpgm
+define void @test_not_isinf_pattern_0(i32 addrspace(1)* nocapture %out, float %x) #0 {
+ %fabs = tail call float @llvm.fabs.f32(float %x) #1
+ %cmp = fcmp ueq float %fabs, 0x7FF0000000000000
+ %ext = zext i1 %cmp to i32
+ store i32 %ext, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}test_not_isinf_pattern_1:
+; SI-NOT: v_cmp_class
+; SI: s_endpgm
+define void @test_not_isinf_pattern_1(i32 addrspace(1)* nocapture %out, float %x) #0 {
+ %fabs = tail call float @llvm.fabs.f32(float %x) #1
+ %cmp = fcmp oeq float %fabs, 0xFFF0000000000000
+ %ext = zext i1 %cmp to i32
+ store i32 %ext, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}test_isfinite_pattern_0:
+; SI-NOT: v_cmp
+; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1f8{{$}}
+; SI: v_cmp_class_f32_e32 vcc, s{{[0-9]+}}, [[MASK]]
+; SI-NOT: v_cmp
+; SI: s_endpgm
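+; 0x1f8 sets the class bits for negative/positive normals, subnormals and zeros,
+; i.e. every finite value, so ord(x) && |x| != +inf becomes one class compare.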
+define void @test_isfinite_pattern_0(i32 addrspace(1)* nocapture %out, float %x) #0 {
+ %ord = fcmp ord float %x, 0.000000e+00
+ %x.fabs = tail call float @llvm.fabs.f32(float %x) #1
+ %ninf = fcmp une float %x.fabs, 0x7FF0000000000000
+ %and = and i1 %ord, %ninf
+ %ext = zext i1 %and to i32
+ store i32 %ext, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; Use negative infinity
+; SI-LABEL: {{^}}test_isfinite_not_pattern_0:
+; SI-NOT: v_cmp_class_f32
+; SI: s_endpgm
+define void @test_isfinite_not_pattern_0(i32 addrspace(1)* nocapture %out, float %x) #0 {
+ %ord = fcmp ord float %x, 0.000000e+00
+ %x.fabs = tail call float @llvm.fabs.f32(float %x) #1
+ %ninf = fcmp une float %x.fabs, 0xFFF0000000000000
+ %and = and i1 %ord, %ninf
+ %ext = zext i1 %and to i32
+ store i32 %ext, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; No fabs
+; SI-LABEL: {{^}}test_isfinite_not_pattern_1:
+; SI-NOT: v_cmp_class_f32
+; SI: s_endpgm
+define void @test_isfinite_not_pattern_1(i32 addrspace(1)* nocapture %out, float %x) #0 {
+ %ord = fcmp ord float %x, 0.000000e+00
+ %ninf = fcmp une float %x, 0x7FF0000000000000
+ %and = and i1 %ord, %ninf
+ %ext = zext i1 %and to i32
+ store i32 %ext, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; fabs of different value
+; SI-LABEL: {{^}}test_isfinite_not_pattern_2:
+; SI-NOT: v_cmp_class_f32
+; SI: s_endpgm
+define void @test_isfinite_not_pattern_2(i32 addrspace(1)* nocapture %out, float %x, float %y) #0 {
+ %ord = fcmp ord float %x, 0.000000e+00
+ %x.fabs = tail call float @llvm.fabs.f32(float %y) #1
+ %ninf = fcmp une float %x.fabs, 0x7FF0000000000000
+ %and = and i1 %ord, %ninf
+ %ext = zext i1 %and to i32
+ store i32 %ext, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; Wrong ordered compare type
+; SI-LABEL: {{^}}test_isfinite_not_pattern_3:
+; SI-NOT: v_cmp_class_f32
+; SI: s_endpgm
+define void @test_isfinite_not_pattern_3(i32 addrspace(1)* nocapture %out, float %x) #0 {
+ %ord = fcmp uno float %x, 0.000000e+00
+ %x.fabs = tail call float @llvm.fabs.f32(float %x) #1
+ %ninf = fcmp une float %x.fabs, 0x7FF0000000000000
+ %and = and i1 %ord, %ninf
+ %ext = zext i1 %and to i32
+ store i32 %ext, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; Wrong unordered compare
+; SI-LABEL: {{^}}test_isfinite_not_pattern_4:
+; SI-NOT: v_cmp_class_f32
+; SI: s_endpgm
+define void @test_isfinite_not_pattern_4(i32 addrspace(1)* nocapture %out, float %x) #0 {
+ %ord = fcmp ord float %x, 0.000000e+00
+ %x.fabs = tail call float @llvm.fabs.f32(float %x) #1
+ %ninf = fcmp one float %x.fabs, 0x7FF0000000000000
+ %and = and i1 %ord, %ninf
+ %ext = zext i1 %and to i32
+ store i32 %ext, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/fp16_to_fp.ll b/test/CodeGen/AMDGPU/fp16_to_fp.ll
new file mode 100644
index 000000000000..5a79ca82bc29
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fp16_to_fp.ll
@@ -0,0 +1,29 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+declare float @llvm.convert.from.fp16.f32(i16) nounwind readnone
+declare double @llvm.convert.from.fp16.f64(i16) nounwind readnone
+
+; SI-LABEL: {{^}}test_convert_fp16_to_fp32:
+; SI: buffer_load_ushort [[VAL:v[0-9]+]]
+; SI: v_cvt_f32_f16_e32 [[RESULT:v[0-9]+]], [[VAL]]
+; SI: buffer_store_dword [[RESULT]]
+define void @test_convert_fp16_to_fp32(float addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind {
+ %val = load i16, i16 addrspace(1)* %in, align 2
+ %cvt = call float @llvm.convert.from.fp16.f32(i16 %val) nounwind readnone
+ store float %cvt, float addrspace(1)* %out, align 4
+ ret void
+}
+
+
+; SI-LABEL: {{^}}test_convert_fp16_to_fp64:
+; SI: buffer_load_ushort [[VAL:v[0-9]+]]
+; SI: v_cvt_f32_f16_e32 [[RESULT32:v[0-9]+]], [[VAL]]
+; SI: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[RESULT32]]
+; SI: buffer_store_dwordx2 [[RESULT]]
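+; There is no single f16 -> f64 convert instruction on these targets, so the
+; value is extended to f32 first and then to f64.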
+define void @test_convert_fp16_to_fp64(double addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind {
+ %val = load i16, i16 addrspace(1)* %in, align 2
+ %cvt = call double @llvm.convert.from.fp16.f64(i16 %val) nounwind readnone
+ store double %cvt, double addrspace(1)* %out, align 4
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/fp32_to_fp16.ll b/test/CodeGen/AMDGPU/fp32_to_fp16.ll
new file mode 100644
index 000000000000..67925ebd82b6
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fp32_to_fp16.ll
@@ -0,0 +1,15 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+declare i16 @llvm.convert.to.fp16.f32(float) nounwind readnone
+
+; SI-LABEL: {{^}}test_convert_fp32_to_fp16:
+; SI: buffer_load_dword [[VAL:v[0-9]+]]
+; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[VAL]]
+; SI: buffer_store_short [[RESULT]]
+define void @test_convert_fp32_to_fp16(i16 addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
+ %val = load float, float addrspace(1)* %in, align 4
+ %cvt = call i16 @llvm.convert.to.fp16.f32(float %val) nounwind readnone
+ store i16 %cvt, i16 addrspace(1)* %out, align 2
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/fp_to_sint.f64.ll b/test/CodeGen/AMDGPU/fp_to_sint.f64.ll
new file mode 100644
index 000000000000..12df6606e8ff
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fp_to_sint.f64.ll
@@ -0,0 +1,56 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
+
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+
+; FUNC-LABEL: @fp_to_sint_f64_i32
+; SI: v_cvt_i32_f64_e32
+define void @fp_to_sint_f64_i32(i32 addrspace(1)* %out, double %in) {
+ %result = fptosi double %in to i32
+ store i32 %result, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: @fp_to_sint_v2f64_v2i32
+; SI: v_cvt_i32_f64_e32
+; SI: v_cvt_i32_f64_e32
+define void @fp_to_sint_v2f64_v2i32(<2 x i32> addrspace(1)* %out, <2 x double> %in) {
+ %result = fptosi <2 x double> %in to <2 x i32>
+ store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: @fp_to_sint_v4f64_v4i32
+; SI: v_cvt_i32_f64_e32
+; SI: v_cvt_i32_f64_e32
+; SI: v_cvt_i32_f64_e32
+; SI: v_cvt_i32_f64_e32
+define void @fp_to_sint_v4f64_v4i32(<4 x i32> addrspace(1)* %out, <4 x double> %in) {
+ %result = fptosi <4 x double> %in to <4 x i32>
+ store <4 x i32> %result, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: @fp_to_sint_i64_f64
+; CI-DAG: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]]
+; CI-DAG: v_trunc_f64_e32 [[TRUNC:v\[[0-9]+:[0-9]+\]]], [[VAL]]
+; CI-DAG: s_mov_b32 s[[K0_LO:[0-9]+]], 0{{$}}
+; CI-DAG: s_mov_b32 s[[K0_HI:[0-9]+]], 0x3df00000
+
+; CI-DAG: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], [[VAL]], s{{\[}}[[K0_LO]]:[[K0_HI]]{{\]}}
+; CI-DAG: v_floor_f64_e32 [[FLOOR:v\[[0-9]+:[0-9]+\]]], [[MUL]]
+
+; CI-DAG: s_mov_b32 s[[K1_HI:[0-9]+]], 0xc1f00000
+
+; CI-DAG: v_fma_f64 [[FMA:v\[[0-9]+:[0-9]+\]]], [[FLOOR]], s{{\[[0-9]+}}:[[K1_HI]]{{\]}}, [[TRUNC]]
+; CI-DAG: v_cvt_u32_f64_e32 v[[LO:[0-9]+]], [[FMA]]
+; CI-DAG: v_cvt_i32_f64_e32 v[[HI:[0-9]+]], [[FLOOR]]
+; CI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
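+; 0x3df00000 is the high dword of the f64 constant 2^-32 and 0xc1f00000 the high
+; dword of -2^32: the high result word is floor(x * 2^-32) and the low word is
+; fma(floor(x * 2^-32), -2^32, trunc(x)).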
+define void @fp_to_sint_i64_f64(i64 addrspace(1)* %out, double addrspace(1)* %in) {
+ %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %gep = getelementptr double, double addrspace(1)* %in, i32 %tid
+ %val = load double, double addrspace(1)* %gep, align 8
+ %cast = fptosi double %val to i64
+ store i64 %cast, i64 addrspace(1)* %out, align 8
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/fp_to_sint.ll b/test/CodeGen/AMDGPU/fp_to_sint.ll
new file mode 100644
index 000000000000..301a94b4904c
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fp_to_sint.ll
@@ -0,0 +1,230 @@
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s --check-prefix=EG --check-prefix=FUNC
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s --check-prefix=SI --check-prefix=FUNC
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s --check-prefix=SI --check-prefix=FUNC
+
+declare float @llvm.fabs.f32(float) #0
+
+; FUNC-LABEL: {{^}}fp_to_sint_i32:
+; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; SI: v_cvt_i32_f32_e32
+; SI: s_endpgm
+define void @fp_to_sint_i32(i32 addrspace(1)* %out, float %in) {
+ %conv = fptosi float %in to i32
+ store i32 %conv, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fp_to_sint_i32_fabs:
+; SI: v_cvt_i32_f32_e64 v{{[0-9]+}}, |s{{[0-9]+}}|{{$}}
+define void @fp_to_sint_i32_fabs(i32 addrspace(1)* %out, float %in) {
+ %in.fabs = call float @llvm.fabs.f32(float %in) #0
+ %conv = fptosi float %in.fabs to i32
+ store i32 %conv, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fp_to_sint_v2i32:
+; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; SI: v_cvt_i32_f32_e32
+; SI: v_cvt_i32_f32_e32
+define void @fp_to_sint_v2i32(<2 x i32> addrspace(1)* %out, <2 x float> %in) {
+ %result = fptosi <2 x float> %in to <2 x i32>
+ store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fp_to_sint_v4i32:
+; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW]}}
+; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; SI: v_cvt_i32_f32_e32
+; SI: v_cvt_i32_f32_e32
+; SI: v_cvt_i32_f32_e32
+; SI: v_cvt_i32_f32_e32
+define void @fp_to_sint_v4i32(<4 x i32> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
+ %value = load <4 x float>, <4 x float> addrspace(1) * %in
+ %result = fptosi <4 x float> %value to <4 x i32>
+ store <4 x i32> %result, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fp_to_sint_i64:
+
+; EG-DAG: AND_INT
+; EG-DAG: LSHR
+; EG-DAG: SUB_INT
+; EG-DAG: AND_INT
+; EG-DAG: ASHR
+; EG-DAG: AND_INT
+; EG-DAG: OR_INT
+; EG-DAG: SUB_INT
+; EG-DAG: LSHL
+; EG-DAG: LSHL
+; EG-DAG: SUB_INT
+; EG-DAG: LSHR
+; EG-DAG: LSHR
+; EG-DAG: SETGT_UINT
+; EG-DAG: SETGT_INT
+; EG-DAG: XOR_INT
+; EG-DAG: XOR_INT
+; EG: SUB_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+
+; Check that the compiler doesn't crash with a "cannot select" error
+; SI: s_endpgm
+define void @fp_to_sint_i64 (i64 addrspace(1)* %out, float %in) {
+entry:
+ %0 = fptosi float %in to i64
+ store i64 %0, i64 addrspace(1)* %out
+ ret void
+}
+
+; FUNC: {{^}}fp_to_sint_v2i64:
+; EG-DAG: AND_INT
+; EG-DAG: LSHR
+; EG-DAG: SUB_INT
+; EG-DAG: AND_INT
+; EG-DAG: ASHR
+; EG-DAG: AND_INT
+; EG-DAG: OR_INT
+; EG-DAG: SUB_INT
+; EG-DAG: LSHL
+; EG-DAG: LSHL
+; EG-DAG: SUB_INT
+; EG-DAG: LSHR
+; EG-DAG: LSHR
+; EG-DAG: SETGT_UINT
+; EG-DAG: SETGT_INT
+; EG-DAG: XOR_INT
+; EG-DAG: XOR_INT
+; EG-DAG: SUB_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: AND_INT
+; EG-DAG: LSHR
+; EG-DAG: SUB_INT
+; EG-DAG: AND_INT
+; EG-DAG: ASHR
+; EG-DAG: AND_INT
+; EG-DAG: OR_INT
+; EG-DAG: SUB_INT
+; EG-DAG: LSHL
+; EG-DAG: LSHL
+; EG-DAG: SUB_INT
+; EG-DAG: LSHR
+; EG-DAG: LSHR
+; EG-DAG: SETGT_UINT
+; EG-DAG: SETGT_INT
+; EG-DAG: XOR_INT
+; EG-DAG: XOR_INT
+; EG-DAG: SUB_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+
+; SI: s_endpgm
+define void @fp_to_sint_v2i64(<2 x i64> addrspace(1)* %out, <2 x float> %x) {
+ %conv = fptosi <2 x float> %x to <2 x i64>
+ store <2 x i64> %conv, <2 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC: {{^}}fp_to_sint_v4i64:
+; EG-DAG: AND_INT
+; EG-DAG: LSHR
+; EG-DAG: SUB_INT
+; EG-DAG: AND_INT
+; EG-DAG: ASHR
+; EG-DAG: AND_INT
+; EG-DAG: OR_INT
+; EG-DAG: SUB_INT
+; EG-DAG: LSHL
+; EG-DAG: LSHL
+; EG-DAG: SUB_INT
+; EG-DAG: LSHR
+; EG-DAG: LSHR
+; EG-DAG: SETGT_UINT
+; EG-DAG: SETGT_INT
+; EG-DAG: XOR_INT
+; EG-DAG: XOR_INT
+; EG-DAG: SUB_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: AND_INT
+; EG-DAG: LSHR
+; EG-DAG: SUB_INT
+; EG-DAG: AND_INT
+; EG-DAG: ASHR
+; EG-DAG: AND_INT
+; EG-DAG: OR_INT
+; EG-DAG: SUB_INT
+; EG-DAG: LSHL
+; EG-DAG: LSHL
+; EG-DAG: SUB_INT
+; EG-DAG: LSHR
+; EG-DAG: LSHR
+; EG-DAG: SETGT_UINT
+; EG-DAG: SETGT_INT
+; EG-DAG: XOR_INT
+; EG-DAG: XOR_INT
+; EG-DAG: SUB_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: AND_INT
+; EG-DAG: LSHR
+; EG-DAG: SUB_INT
+; EG-DAG: AND_INT
+; EG-DAG: ASHR
+; EG-DAG: AND_INT
+; EG-DAG: OR_INT
+; EG-DAG: SUB_INT
+; EG-DAG: LSHL
+; EG-DAG: LSHL
+; EG-DAG: SUB_INT
+; EG-DAG: LSHR
+; EG-DAG: LSHR
+; EG-DAG: SETGT_UINT
+; EG-DAG: SETGT_INT
+; EG-DAG: XOR_INT
+; EG-DAG: XOR_INT
+; EG-DAG: SUB_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: AND_INT
+; EG-DAG: LSHR
+; EG-DAG: SUB_INT
+; EG-DAG: AND_INT
+; EG-DAG: ASHR
+; EG-DAG: AND_INT
+; EG-DAG: OR_INT
+; EG-DAG: SUB_INT
+; EG-DAG: LSHL
+; EG-DAG: LSHL
+; EG-DAG: SUB_INT
+; EG-DAG: LSHR
+; EG-DAG: LSHR
+; EG-DAG: SETGT_UINT
+; EG-DAG: SETGT_INT
+; EG-DAG: XOR_INT
+; EG-DAG: XOR_INT
+; EG-DAG: SUB_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+
+; SI: s_endpgm
+define void @fp_to_sint_v4i64(<4 x i64> addrspace(1)* %out, <4 x float> %x) {
+ %conv = fptosi <4 x float> %x to <4 x i64>
+ store <4 x i64> %conv, <4 x i64> addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/fp_to_uint.f64.ll b/test/CodeGen/AMDGPU/fp_to_uint.f64.ll
new file mode 100644
index 000000000000..41bc2a780014
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fp_to_uint.f64.ll
@@ -0,0 +1,70 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
+
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+
+; SI-LABEL: {{^}}fp_to_uint_i32_f64:
+; SI: v_cvt_u32_f64_e32
+define void @fp_to_uint_i32_f64(i32 addrspace(1)* %out, double %in) {
+ %cast = fptoui double %in to i32
+ store i32 %cast, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: @fp_to_uint_v2i32_v2f64
+; SI: v_cvt_u32_f64_e32
+; SI: v_cvt_u32_f64_e32
+define void @fp_to_uint_v2i32_v2f64(<2 x i32> addrspace(1)* %out, <2 x double> %in) {
+ %cast = fptoui <2 x double> %in to <2 x i32>
+ store <2 x i32> %cast, <2 x i32> addrspace(1)* %out, align 8
+ ret void
+}
+
+; SI-LABEL: @fp_to_uint_v4i32_v4f64
+; SI: v_cvt_u32_f64_e32
+; SI: v_cvt_u32_f64_e32
+; SI: v_cvt_u32_f64_e32
+; SI: v_cvt_u32_f64_e32
+define void @fp_to_uint_v4i32_v4f64(<4 x i32> addrspace(1)* %out, <4 x double> %in) {
+ %cast = fptoui <4 x double> %in to <4 x i32>
+ store <4 x i32> %cast, <4 x i32> addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: @fp_to_uint_i64_f64
+; CI-DAG: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]]
+; CI-DAG: v_trunc_f64_e32 [[TRUNC:v\[[0-9]+:[0-9]+\]]], [[VAL]]
+; CI-DAG: s_mov_b32 s[[K0_LO:[0-9]+]], 0{{$}}
+; CI-DAG: s_mov_b32 s[[K0_HI:[0-9]+]], 0x3df00000
+
+; CI-DAG: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], [[VAL]], s{{\[}}[[K0_LO]]:[[K0_HI]]{{\]}}
+; CI-DAG: v_floor_f64_e32 [[FLOOR:v\[[0-9]+:[0-9]+\]]], [[MUL]]
+
+; CI-DAG: s_mov_b32 s[[K1_HI:[0-9]+]], 0xc1f00000
+
+; CI-DAG: v_fma_f64 [[FMA:v\[[0-9]+:[0-9]+\]]], [[FLOOR]], s{{\[[0-9]+}}:[[K1_HI]]{{\]}}, [[TRUNC]]
+; CI-DAG: v_cvt_u32_f64_e32 v[[LO:[0-9]+]], [[FMA]]
+; CI-DAG: v_cvt_u32_f64_e32 v[[HI:[0-9]+]], [[FLOOR]]
+; CI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @fp_to_uint_i64_f64(i64 addrspace(1)* %out, double addrspace(1)* %in) {
+ %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %gep = getelementptr double, double addrspace(1)* %in, i32 %tid
+ %val = load double, double addrspace(1)* %gep, align 8
+ %cast = fptoui double %val to i64
+ store i64 %cast, i64 addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: @fp_to_uint_v2i64_v2f64
+define void @fp_to_uint_v2i64_v2f64(<2 x i64> addrspace(1)* %out, <2 x double> %in) {
+ %cast = fptoui <2 x double> %in to <2 x i64>
+ store <2 x i64> %cast, <2 x i64> addrspace(1)* %out, align 16
+ ret void
+}
+
+; SI-LABEL: @fp_to_uint_v4i64_v4f64
+define void @fp_to_uint_v4i64_v4f64(<4 x i64> addrspace(1)* %out, <4 x double> %in) {
+ %cast = fptoui <4 x double> %in to <4 x i64>
+ store <4 x i64> %cast, <4 x i64> addrspace(1)* %out, align 32
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/fp_to_uint.ll b/test/CodeGen/AMDGPU/fp_to_uint.ll
new file mode 100644
index 000000000000..b7b6ccc238b3
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fp_to_uint.ll
@@ -0,0 +1,217 @@
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s -check-prefix=EG -check-prefix=FUNC
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC
+
+; FUNC-LABEL: {{^}}fp_to_uint_f32_to_i32:
+; EG: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+
+; SI: v_cvt_u32_f32_e32
+; SI: s_endpgm
+define void @fp_to_uint_f32_to_i32 (i32 addrspace(1)* %out, float %in) {
+ %conv = fptoui float %in to i32
+ store i32 %conv, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fp_to_uint_v2f32_to_v2i32:
+; EG: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; EG: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+; SI: v_cvt_u32_f32_e32
+; SI: v_cvt_u32_f32_e32
+define void @fp_to_uint_v2f32_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x float> %in) {
+ %result = fptoui <2 x float> %in to <2 x i32>
+ store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fp_to_uint_v4f32_to_v4i32:
+; EG: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; EG: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; EG: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; SI: v_cvt_u32_f32_e32
+; SI: v_cvt_u32_f32_e32
+; SI: v_cvt_u32_f32_e32
+; SI: v_cvt_u32_f32_e32
+
+define void @fp_to_uint_v4f32_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
+ %value = load <4 x float>, <4 x float> addrspace(1) * %in
+ %result = fptoui <4 x float> %value to <4 x i32>
+ store <4 x i32> %result, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC: {{^}}fp_to_uint_f32_to_i64:
+; EG-DAG: AND_INT
+; EG-DAG: LSHR
+; EG-DAG: SUB_INT
+; EG-DAG: AND_INT
+; EG-DAG: ASHR
+; EG-DAG: AND_INT
+; EG-DAG: OR_INT
+; EG-DAG: SUB_INT
+; EG-DAG: LSHL
+; EG-DAG: LSHL
+; EG-DAG: SUB_INT
+; EG-DAG: LSHR
+; EG-DAG: LSHR
+; EG-DAG: SETGT_UINT
+; EG-DAG: SETGT_INT
+; EG-DAG: XOR_INT
+; EG-DAG: XOR_INT
+; EG: SUB_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+
+; SI: s_endpgm
+define void @fp_to_uint_f32_to_i64(i64 addrspace(1)* %out, float %x) {
+ %conv = fptoui float %x to i64
+ store i64 %conv, i64 addrspace(1)* %out
+ ret void
+}
+
+; FUNC: {{^}}fp_to_uint_v2f32_to_v2i64:
+; EG-DAG: AND_INT
+; EG-DAG: LSHR
+; EG-DAG: SUB_INT
+; EG-DAG: AND_INT
+; EG-DAG: ASHR
+; EG-DAG: AND_INT
+; EG-DAG: OR_INT
+; EG-DAG: SUB_INT
+; EG-DAG: LSHL
+; EG-DAG: LSHL
+; EG-DAG: SUB_INT
+; EG-DAG: LSHR
+; EG-DAG: LSHR
+; EG-DAG: SETGT_UINT
+; EG-DAG: SETGT_INT
+; EG-DAG: XOR_INT
+; EG-DAG: XOR_INT
+; EG-DAG: SUB_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: AND_INT
+; EG-DAG: LSHR
+; EG-DAG: SUB_INT
+; EG-DAG: AND_INT
+; EG-DAG: ASHR
+; EG-DAG: AND_INT
+; EG-DAG: OR_INT
+; EG-DAG: SUB_INT
+; EG-DAG: LSHL
+; EG-DAG: LSHL
+; EG-DAG: SUB_INT
+; EG-DAG: LSHR
+; EG-DAG: LSHR
+; EG-DAG: SETGT_UINT
+; EG-DAG: SETGT_INT
+; EG-DAG: XOR_INT
+; EG-DAG: XOR_INT
+; EG-DAG: SUB_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+
+; SI: s_endpgm
+define void @fp_to_uint_v2f32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x float> %x) {
+ %conv = fptoui <2 x float> %x to <2 x i64>
+ store <2 x i64> %conv, <2 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC: {{^}}fp_to_uint_v4f32_to_v4i64:
+; EG-DAG: AND_INT
+; EG-DAG: LSHR
+; EG-DAG: SUB_INT
+; EG-DAG: AND_INT
+; EG-DAG: ASHR
+; EG-DAG: AND_INT
+; EG-DAG: OR_INT
+; EG-DAG: SUB_INT
+; EG-DAG: LSHL
+; EG-DAG: LSHL
+; EG-DAG: SUB_INT
+; EG-DAG: LSHR
+; EG-DAG: LSHR
+; EG-DAG: SETGT_UINT
+; EG-DAG: SETGT_INT
+; EG-DAG: XOR_INT
+; EG-DAG: XOR_INT
+; EG-DAG: SUB_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: AND_INT
+; EG-DAG: LSHR
+; EG-DAG: SUB_INT
+; EG-DAG: AND_INT
+; EG-DAG: ASHR
+; EG-DAG: AND_INT
+; EG-DAG: OR_INT
+; EG-DAG: SUB_INT
+; EG-DAG: LSHL
+; EG-DAG: LSHL
+; EG-DAG: SUB_INT
+; EG-DAG: LSHR
+; EG-DAG: LSHR
+; EG-DAG: SETGT_UINT
+; EG-DAG: SETGT_INT
+; EG-DAG: XOR_INT
+; EG-DAG: XOR_INT
+; EG-DAG: SUB_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: AND_INT
+; EG-DAG: LSHR
+; EG-DAG: SUB_INT
+; EG-DAG: AND_INT
+; EG-DAG: ASHR
+; EG-DAG: AND_INT
+; EG-DAG: OR_INT
+; EG-DAG: SUB_INT
+; EG-DAG: LSHL
+; EG-DAG: LSHL
+; EG-DAG: SUB_INT
+; EG-DAG: LSHR
+; EG-DAG: LSHR
+; EG-DAG: SETGT_UINT
+; EG-DAG: SETGT_INT
+; EG-DAG: XOR_INT
+; EG-DAG: XOR_INT
+; EG-DAG: SUB_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: AND_INT
+; EG-DAG: LSHR
+; EG-DAG: SUB_INT
+; EG-DAG: AND_INT
+; EG-DAG: ASHR
+; EG-DAG: AND_INT
+; EG-DAG: OR_INT
+; EG-DAG: SUB_INT
+; EG-DAG: LSHL
+; EG-DAG: LSHL
+; EG-DAG: SUB_INT
+; EG-DAG: LSHR
+; EG-DAG: LSHR
+; EG-DAG: SETGT_UINT
+; EG-DAG: SETGT_INT
+; EG-DAG: XOR_INT
+; EG-DAG: XOR_INT
+; EG-DAG: SUB_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+
+; SI: s_endpgm
+define void @fp_to_uint_v4f32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x float> %x) {
+ %conv = fptoui <4 x float> %x to <4 x i64>
+ store <4 x i64> %conv, <4 x i64> addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/fpext.ll b/test/CodeGen/AMDGPU/fpext.ll
new file mode 100644
index 000000000000..734a43be2296
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fpext.ll
@@ -0,0 +1,45 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}fpext_f32_to_f64:
+; SI: v_cvt_f64_f32_e32 {{v\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
+define void @fpext_f32_to_f64(double addrspace(1)* %out, float %in) {
+ %result = fpext float %in to double
+ store double %result, double addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fpext_v2f32_to_v2f64:
+; SI: v_cvt_f64_f32_e32
+; SI: v_cvt_f64_f32_e32
+define void @fpext_v2f32_to_v2f64(<2 x double> addrspace(1)* %out, <2 x float> %in) {
+ %result = fpext <2 x float> %in to <2 x double>
+ store <2 x double> %result, <2 x double> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fpext_v4f32_to_v4f64:
+; SI: v_cvt_f64_f32_e32
+; SI: v_cvt_f64_f32_e32
+; SI: v_cvt_f64_f32_e32
+; SI: v_cvt_f64_f32_e32
+define void @fpext_v4f32_to_v4f64(<4 x double> addrspace(1)* %out, <4 x float> %in) {
+ %result = fpext <4 x float> %in to <4 x double>
+ store <4 x double> %result, <4 x double> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fpext_v8f32_to_v8f64:
+; SI: v_cvt_f64_f32_e32
+; SI: v_cvt_f64_f32_e32
+; SI: v_cvt_f64_f32_e32
+; SI: v_cvt_f64_f32_e32
+; SI: v_cvt_f64_f32_e32
+; SI: v_cvt_f64_f32_e32
+; SI: v_cvt_f64_f32_e32
+; SI: v_cvt_f64_f32_e32
+define void @fpext_v8f32_to_v8f64(<8 x double> addrspace(1)* %out, <8 x float> %in) {
+ %result = fpext <8 x float> %in to <8 x double>
+ store <8 x double> %result, <8 x double> addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/fptrunc.ll b/test/CodeGen/AMDGPU/fptrunc.ll
new file mode 100644
index 000000000000..385e10e7baae
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fptrunc.ll
@@ -0,0 +1,45 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}fptrunc_f64_to_f32:
+; SI: v_cvt_f32_f64_e32 {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @fptrunc_f64_to_f32(float addrspace(1)* %out, double %in) {
+ %result = fptrunc double %in to float
+ store float %result, float addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fptrunc_v2f64_to_v2f32:
+; SI: v_cvt_f32_f64_e32
+; SI: v_cvt_f32_f64_e32
+define void @fptrunc_v2f64_to_v2f32(<2 x float> addrspace(1)* %out, <2 x double> %in) {
+ %result = fptrunc <2 x double> %in to <2 x float>
+ store <2 x float> %result, <2 x float> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fptrunc_v4f64_to_v4f32:
+; SI: v_cvt_f32_f64_e32
+; SI: v_cvt_f32_f64_e32
+; SI: v_cvt_f32_f64_e32
+; SI: v_cvt_f32_f64_e32
+define void @fptrunc_v4f64_to_v4f32(<4 x float> addrspace(1)* %out, <4 x double> %in) {
+ %result = fptrunc <4 x double> %in to <4 x float>
+ store <4 x float> %result, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fptrunc_v8f64_to_v8f32:
+; SI: v_cvt_f32_f64_e32
+; SI: v_cvt_f32_f64_e32
+; SI: v_cvt_f32_f64_e32
+; SI: v_cvt_f32_f64_e32
+; SI: v_cvt_f32_f64_e32
+; SI: v_cvt_f32_f64_e32
+; SI: v_cvt_f32_f64_e32
+; SI: v_cvt_f32_f64_e32
+define void @fptrunc_v8f64_to_v8f32(<8 x float> addrspace(1)* %out, <8 x double> %in) {
+ %result = fptrunc <8 x double> %in to <8 x float>
+ store <8 x float> %result, <8 x float> addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/frem.ll b/test/CodeGen/AMDGPU/frem.ll
new file mode 100644
index 000000000000..f245ef08cb9d
--- /dev/null
+++ b/test/CodeGen/AMDGPU/frem.ll
@@ -0,0 +1,112 @@
+; RUN: llc -march=amdgcn -mcpu=SI -enable-misched < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -enable-misched < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -enable-misched < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}frem_f32:
+; GCN-DAG: buffer_load_dword [[X:v[0-9]+]], {{.*$}}
+; GCN-DAG: buffer_load_dword [[Y:v[0-9]+]], {{.*}} offset:16
+; GCN-DAG: v_cmp
+; GCN-DAG: v_mul_f32
+; GCN: v_rcp_f32_e32
+; GCN: v_mul_f32_e32
+; GCN: v_mul_f32_e32
+; GCN: v_trunc_f32_e32
+; GCN: v_mad_f32
+; GCN: s_endpgm
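+; frem is expanded as x - trunc(x / y) * y, hence the division sequence followed
+; by the trunc and mad checked above.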
+define void @frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
+ float addrspace(1)* %in2) #0 {
+ %gep2 = getelementptr float, float addrspace(1)* %in2, i32 4
+ %r0 = load float, float addrspace(1)* %in1, align 4
+ %r1 = load float, float addrspace(1)* %gep2, align 4
+ %r2 = frem float %r0, %r1
+ store float %r2, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}unsafe_frem_f32:
+; GCN: buffer_load_dword [[Y:v[0-9]+]], {{.*}} offset:16
+; GCN: buffer_load_dword [[X:v[0-9]+]], {{.*}}
+; GCN: v_rcp_f32_e32 [[INVY:v[0-9]+]], [[Y]]
+; GCN: v_mul_f32_e32 [[DIV:v[0-9]+]], [[INVY]], [[X]]
+; GCN: v_trunc_f32_e32 [[TRUNC:v[0-9]+]], [[DIV]]
+; GCN: v_mad_f32 [[RESULT:v[0-9]+]], -[[TRUNC]], [[Y]], [[X]]
+; GCN: buffer_store_dword [[RESULT]]
+; GCN: s_endpgm
+define void @unsafe_frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
+ float addrspace(1)* %in2) #1 {
+ %gep2 = getelementptr float, float addrspace(1)* %in2, i32 4
+ %r0 = load float, float addrspace(1)* %in1, align 4
+ %r1 = load float, float addrspace(1)* %gep2, align 4
+ %r2 = frem float %r0, %r1
+ store float %r2, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}frem_f64:
+; GCN: buffer_load_dwordx2 [[Y:v\[[0-9]+:[0-9]+\]]], {{.*}}, 0
+; GCN: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]], {{.*}}, 0
+; GCN-DAG: v_div_fmas_f64
+; GCN-DAG: v_div_scale_f64
+; GCN-DAG: v_mul_f64
+; CI: v_trunc_f64_e32
+; CI: v_mul_f64
+; GCN: v_add_f64
+; GCN: buffer_store_dwordx2
+; GCN: s_endpgm
+define void @frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+ double addrspace(1)* %in2) #0 {
+ %r0 = load double, double addrspace(1)* %in1, align 8
+ %r1 = load double, double addrspace(1)* %in2, align 8
+ %r2 = frem double %r0, %r1
+ store double %r2, double addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}unsafe_frem_f64:
+; GCN: v_rcp_f64_e32
+; GCN: v_mul_f64
+; SI: v_bfe_u32
+; CI: v_trunc_f64_e32
+; GCN: v_fma_f64
+; GCN: s_endpgm
+define void @unsafe_frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+ double addrspace(1)* %in2) #1 {
+ %r0 = load double, double addrspace(1)* %in1, align 8
+ %r1 = load double, double addrspace(1)* %in2, align 8
+ %r2 = frem double %r0, %r1
+ store double %r2, double addrspace(1)* %out, align 8
+ ret void
+}
+
+define void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in1,
+ <2 x float> addrspace(1)* %in2) #0 {
+ %gep2 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in2, i32 4
+ %r0 = load <2 x float>, <2 x float> addrspace(1)* %in1, align 8
+ %r1 = load <2 x float>, <2 x float> addrspace(1)* %gep2, align 8
+ %r2 = frem <2 x float> %r0, %r1
+ store <2 x float> %r2, <2 x float> addrspace(1)* %out, align 8
+ ret void
+}
+
+define void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in1,
+ <4 x float> addrspace(1)* %in2) #0 {
+ %gep2 = getelementptr <4 x float>, <4 x float> addrspace(1)* %in2, i32 4
+ %r0 = load <4 x float>, <4 x float> addrspace(1)* %in1, align 16
+ %r1 = load <4 x float>, <4 x float> addrspace(1)* %gep2, align 16
+ %r2 = frem <4 x float> %r0, %r1
+ store <4 x float> %r2, <4 x float> addrspace(1)* %out, align 16
+ ret void
+}
+
+define void @frem_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1,
+ <2 x double> addrspace(1)* %in2) #0 {
+ %gep2 = getelementptr <2 x double>, <2 x double> addrspace(1)* %in2, i32 4
+ %r0 = load <2 x double>, <2 x double> addrspace(1)* %in1, align 16
+ %r1 = load <2 x double>, <2 x double> addrspace(1)* %gep2, align 16
+ %r2 = frem <2 x double> %r0, %r1
+ store <2 x double> %r2, <2 x double> addrspace(1)* %out, align 16
+ ret void
+}
+
+attributes #0 = { nounwind "unsafe-fp-math"="false" }
+attributes #1 = { nounwind "unsafe-fp-math"="true" }
diff --git a/test/CodeGen/AMDGPU/fsqrt.ll b/test/CodeGen/AMDGPU/fsqrt.ll
new file mode 100644
index 000000000000..04101346cdf9
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fsqrt.ll
@@ -0,0 +1,29 @@
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck %s
+
+; Run with unsafe-fp-math to make sure nothing tries to turn this into 1 / rsqrt(x)
+
+; CHECK: {{^}}fsqrt_f32:
+; CHECK: v_sqrt_f32_e32 {{v[0-9]+, v[0-9]+}}
+
+define void @fsqrt_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
+ %r0 = load float, float addrspace(1)* %in
+ %r1 = call float @llvm.sqrt.f32(float %r0)
+ store float %r1, float addrspace(1)* %out
+ ret void
+}
+
+; CHECK: {{^}}fsqrt_f64:
+; CHECK: v_sqrt_f64_e32 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
+
+define void @fsqrt_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
+ %r0 = load double, double addrspace(1)* %in
+ %r1 = call double @llvm.sqrt.f64(double %r0)
+ store double %r1, double addrspace(1)* %out
+ ret void
+}
+
+declare float @llvm.sqrt.f32(float %Val)
+declare double @llvm.sqrt.f64(double %Val)
diff --git a/test/CodeGen/AMDGPU/fsub.ll b/test/CodeGen/AMDGPU/fsub.ll
new file mode 100644
index 000000000000..dfe41cb5b111
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fsub.ll
@@ -0,0 +1,75 @@
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+
+; FUNC-LABEL: {{^}}v_fsub_f32:
+; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+define void @v_fsub_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
+ %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
+ %a = load float, float addrspace(1)* %in, align 4
+ %b = load float, float addrspace(1)* %b_ptr, align 4
+ %result = fsub float %a, %b
+ store float %result, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_fsub_f32:
+; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, -KC0[2].W
+
+; SI: v_sub_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
+define void @s_fsub_f32(float addrspace(1)* %out, float %a, float %b) {
+ %sub = fsub float %a, %b
+ store float %sub, float addrspace(1)* %out, align 4
+ ret void
+}
+
+declare float @llvm.R600.load.input(i32) readnone
+
+declare void @llvm.AMDGPU.store.output(float, i32)
+
+; FUNC-LABEL: {{^}}fsub_v2f32:
+; R600-DAG: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, -KC0[3].Z
+; R600-DAG: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, -KC0[3].Y
+
+; FIXME: Should be using SGPR directly for first operand
+; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+define void @fsub_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
+ %sub = fsub <2 x float> %a, %b
+ store <2 x float> %sub, <2 x float> addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_fsub_v4f32:
+; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
+; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
+; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
+; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
+
+; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+define void @v_fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
+ %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
+ %a = load <4 x float>, <4 x float> addrspace(1)* %in, align 16
+ %b = load <4 x float>, <4 x float> addrspace(1)* %b_ptr, align 16
+ %result = fsub <4 x float> %a, %b
+ store <4 x float> %result, <4 x float> addrspace(1)* %out, align 16
+ ret void
+}
+
+; FIXME: Should be using SGPR directly for first operand
+
+; FUNC-LABEL: {{^}}s_fsub_v4f32:
+; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+; SI: s_endpgm
+define void @s_fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) {
+ %result = fsub <4 x float> %a, %b
+ store <4 x float> %result, <4 x float> addrspace(1)* %out, align 16
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/fsub64.ll b/test/CodeGen/AMDGPU/fsub64.ll
new file mode 100644
index 000000000000..f34a48e30a86
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fsub64.ll
@@ -0,0 +1,107 @@
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+declare double @llvm.fabs.f64(double) #0
+
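+; There is no v_sub_f64; every fsub below is selected as v_add_f64 with the neg
+; source modifier on the subtracted operand.
+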
+; SI-LABEL: {{^}}fsub_f64:
+; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}}
+define void @fsub_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+ double addrspace(1)* %in2) {
+ %r0 = load double, double addrspace(1)* %in1
+ %r1 = load double, double addrspace(1)* %in2
+ %r2 = fsub double %r0, %r1
+ store double %r2, double addrspace(1)* %out
+ ret void
+}
+
+; SI-LABEL: {{^}}fsub_fabs_f64:
+; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -\|v\[[0-9]+:[0-9]+\]\|}}
+define void @fsub_fabs_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+ double addrspace(1)* %in2) {
+ %r0 = load double, double addrspace(1)* %in1
+ %r1 = load double, double addrspace(1)* %in2
+ %r1.fabs = call double @llvm.fabs.f64(double %r1) #0
+ %r2 = fsub double %r0, %r1.fabs
+ store double %r2, double addrspace(1)* %out
+ ret void
+}
+
+; SI-LABEL: {{^}}fsub_fabs_inv_f64:
+; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], \|v\[[0-9]+:[0-9]+\]\|, -v\[[0-9]+:[0-9]+\]}}
+define void @fsub_fabs_inv_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+ double addrspace(1)* %in2) {
+ %r0 = load double, double addrspace(1)* %in1
+ %r1 = load double, double addrspace(1)* %in2
+ %r0.fabs = call double @llvm.fabs.f64(double %r0) #0
+ %r2 = fsub double %r0.fabs, %r1
+ store double %r2, double addrspace(1)* %out
+ ret void
+}
+
+; SI-LABEL: {{^}}s_fsub_f64:
+; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}}
+define void @s_fsub_f64(double addrspace(1)* %out, double %a, double %b) {
+ %sub = fsub double %a, %b
+ store double %sub, double addrspace(1)* %out
+ ret void
+}
+
+; SI-LABEL: {{^}}s_fsub_imm_f64:
+; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], 4.0, -s\[[0-9]+:[0-9]+\]}}
+define void @s_fsub_imm_f64(double addrspace(1)* %out, double %a, double %b) {
+ %sub = fsub double 4.0, %a
+ store double %sub, double addrspace(1)* %out
+ ret void
+}
+
+; SI-LABEL: {{^}}s_fsub_imm_inv_f64:
+; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], -4.0, s\[[0-9]+:[0-9]+\]}}
+define void @s_fsub_imm_inv_f64(double addrspace(1)* %out, double %a, double %b) {
+ %sub = fsub double %a, 4.0
+ store double %sub, double addrspace(1)* %out
+ ret void
+}
+
+; SI-LABEL: {{^}}s_fsub_self_f64:
+; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -s\[[0-9]+:[0-9]+\]}}
+define void @s_fsub_self_f64(double addrspace(1)* %out, double %a) {
+ %sub = fsub double %a, %a
+ store double %sub, double addrspace(1)* %out
+ ret void
+}
+
+; SI-LABEL: {{^}}fsub_v2f64:
+; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}}
+; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}}
+define void @fsub_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) {
+ %sub = fsub <2 x double> %a, %b
+ store <2 x double> %sub, <2 x double> addrspace(1)* %out
+ ret void
+}
+
+; SI-LABEL: {{^}}fsub_v4f64:
+; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}}
+; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}}
+; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}}
+; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}}
+define void @fsub_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in) {
+ %b_ptr = getelementptr <4 x double>, <4 x double> addrspace(1)* %in, i32 1
+ %a = load <4 x double>, <4 x double> addrspace(1)* %in
+ %b = load <4 x double>, <4 x double> addrspace(1)* %b_ptr
+ %result = fsub <4 x double> %a, %b
+ store <4 x double> %result, <4 x double> addrspace(1)* %out
+ ret void
+}
+
+; SI-LABEL: {{^}}s_fsub_v4f64:
+; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}}
+; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}}
+; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}}
+; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}}
+define void @s_fsub_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) {
+ %result = fsub <4 x double> %a, %b
+ store <4 x double> %result, <4 x double> addrspace(1)* %out, align 16
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/ftrunc.f64.ll b/test/CodeGen/AMDGPU/ftrunc.f64.ll
new file mode 100644
index 000000000000..6618d8b5e57e
--- /dev/null
+++ b/test/CodeGen/AMDGPU/ftrunc.f64.ll
@@ -0,0 +1,111 @@
+; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
+
+declare double @llvm.trunc.f64(double) nounwind readnone
+declare <2 x double> @llvm.trunc.v2f64(<2 x double>) nounwind readnone
+declare <3 x double> @llvm.trunc.v3f64(<3 x double>) nounwind readnone
+declare <4 x double> @llvm.trunc.v4f64(<4 x double>) nounwind readnone
+declare <8 x double> @llvm.trunc.v8f64(<8 x double>) nounwind readnone
+declare <16 x double> @llvm.trunc.v16f64(<16 x double>) nounwind readnone
+
+; FUNC-LABEL: {{^}}v_ftrunc_f64:
+; CI: v_trunc_f64
+; SI: v_bfe_u32 {{v[0-9]+}}, {{v[0-9]+}}, 20, 11
+; SI: s_endpgm
+define void @v_ftrunc_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
+ %x = load double, double addrspace(1)* %in, align 8
+ %y = call double @llvm.trunc.f64(double %x) nounwind readnone
+ store double %y, double addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}ftrunc_f64:
+; CI: v_trunc_f64_e32
+
+; SI: s_bfe_u32 [[SEXP:s[0-9]+]], {{s[0-9]+}}, 0xb0014
+; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
+; SI: s_add_i32 s{{[0-9]+}}, [[SEXP]], 0xfffffc01
+; SI: s_lshr_b64
+; SI: s_not_b64
+; SI: s_and_b64
+; SI: cmp_gt_i32
+; SI: cndmask_b32
+; SI: cndmask_b32
+; SI: cmp_lt_i32
+; SI: cndmask_b32
+; SI: cndmask_b32
+; SI: s_endpgm
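+; SI has no v_trunc_f64, so the truncation is expanded manually: s_bfe_u32 with
+; 0xb0014 (width 11, offset 20) extracts the f64 exponent field, and the mantissa
+; bits below the binary point are then masked off.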
+define void @ftrunc_f64(double addrspace(1)* %out, double %x) {
+ %y = call double @llvm.trunc.f64(double %x) nounwind readnone
+ store double %y, double addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}ftrunc_v2f64:
+; CI: v_trunc_f64_e32
+; CI: v_trunc_f64_e32
+define void @ftrunc_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) {
+ %y = call <2 x double> @llvm.trunc.v2f64(<2 x double> %x) nounwind readnone
+ store <2 x double> %y, <2 x double> addrspace(1)* %out
+ ret void
+}
+
+; FIXME-FUNC-LABEL: {{^}}ftrunc_v3f64:
+; FIXME-CI: v_trunc_f64_e32
+; FIXME-CI: v_trunc_f64_e32
+; FIXME-CI: v_trunc_f64_e32
+; define void @ftrunc_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) {
+; %y = call <3 x double> @llvm.trunc.v3f64(<3 x double> %x) nounwind readnone
+; store <3 x double> %y, <3 x double> addrspace(1)* %out
+; ret void
+; }
+
+; FUNC-LABEL: {{^}}ftrunc_v4f64:
+; CI: v_trunc_f64_e32
+; CI: v_trunc_f64_e32
+; CI: v_trunc_f64_e32
+; CI: v_trunc_f64_e32
+define void @ftrunc_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) {
+ %y = call <4 x double> @llvm.trunc.v4f64(<4 x double> %x) nounwind readnone
+ store <4 x double> %y, <4 x double> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}ftrunc_v8f64:
+; CI: v_trunc_f64_e32
+; CI: v_trunc_f64_e32
+; CI: v_trunc_f64_e32
+; CI: v_trunc_f64_e32
+; CI: v_trunc_f64_e32
+; CI: v_trunc_f64_e32
+; CI: v_trunc_f64_e32
+; CI: v_trunc_f64_e32
+define void @ftrunc_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) {
+ %y = call <8 x double> @llvm.trunc.v8f64(<8 x double> %x) nounwind readnone
+ store <8 x double> %y, <8 x double> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}ftrunc_v16f64:
+; CI: v_trunc_f64_e32
+; CI: v_trunc_f64_e32
+; CI: v_trunc_f64_e32
+; CI: v_trunc_f64_e32
+; CI: v_trunc_f64_e32
+; CI: v_trunc_f64_e32
+; CI: v_trunc_f64_e32
+; CI: v_trunc_f64_e32
+; CI: v_trunc_f64_e32
+; CI: v_trunc_f64_e32
+; CI: v_trunc_f64_e32
+; CI: v_trunc_f64_e32
+; CI: v_trunc_f64_e32
+; CI: v_trunc_f64_e32
+; CI: v_trunc_f64_e32
+; CI: v_trunc_f64_e32
+define void @ftrunc_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %x) {
+ %y = call <16 x double> @llvm.trunc.v16f64(<16 x double> %x) nounwind readnone
+ store <16 x double> %y, <16 x double> addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/ftrunc.ll b/test/CodeGen/AMDGPU/ftrunc.ll
new file mode 100644
index 000000000000..edc08609a8aa
--- /dev/null
+++ b/test/CodeGen/AMDGPU/ftrunc.ll
@@ -0,0 +1,120 @@
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG --check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI --check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI --check-prefix=FUNC %s
+
+declare float @llvm.trunc.f32(float) nounwind readnone
+declare <2 x float> @llvm.trunc.v2f32(<2 x float>) nounwind readnone
+declare <3 x float> @llvm.trunc.v3f32(<3 x float>) nounwind readnone
+declare <4 x float> @llvm.trunc.v4f32(<4 x float>) nounwind readnone
+declare <8 x float> @llvm.trunc.v8f32(<8 x float>) nounwind readnone
+declare <16 x float> @llvm.trunc.v16f32(<16 x float>) nounwind readnone
+
+; FUNC-LABEL: {{^}}ftrunc_f32:
+; EG: TRUNC
+; SI: v_trunc_f32_e32
+define void @ftrunc_f32(float addrspace(1)* %out, float %x) {
+ %y = call float @llvm.trunc.f32(float %x) nounwind readnone
+ store float %y, float addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}ftrunc_v2f32:
+; EG: TRUNC
+; EG: TRUNC
+; SI: v_trunc_f32_e32
+; SI: v_trunc_f32_e32
+define void @ftrunc_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %x) {
+ %y = call <2 x float> @llvm.trunc.v2f32(<2 x float> %x) nounwind readnone
+ store <2 x float> %y, <2 x float> addrspace(1)* %out
+ ret void
+}
+
+; FIXME-FUNC-LABEL: {{^}}ftrunc_v3f32:
+; FIXME-EG: TRUNC
+; FIXME-EG: TRUNC
+; FIXME-EG: TRUNC
+; FIXME-SI: v_trunc_f32_e32
+; FIXME-SI: v_trunc_f32_e32
+; FIXME-SI: v_trunc_f32_e32
+; define void @ftrunc_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %x) {
+; %y = call <3 x float> @llvm.trunc.v3f32(<3 x float> %x) nounwind readnone
+; store <3 x float> %y, <3 x float> addrspace(1)* %out
+; ret void
+; }
+
+; FUNC-LABEL: {{^}}ftrunc_v4f32:
+; EG: TRUNC
+; EG: TRUNC
+; EG: TRUNC
+; EG: TRUNC
+; SI: v_trunc_f32_e32
+; SI: v_trunc_f32_e32
+; SI: v_trunc_f32_e32
+; SI: v_trunc_f32_e32
+define void @ftrunc_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %x) {
+ %y = call <4 x float> @llvm.trunc.v4f32(<4 x float> %x) nounwind readnone
+ store <4 x float> %y, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}ftrunc_v8f32:
+; EG: TRUNC
+; EG: TRUNC
+; EG: TRUNC
+; EG: TRUNC
+; EG: TRUNC
+; EG: TRUNC
+; EG: TRUNC
+; EG: TRUNC
+; SI: v_trunc_f32_e32
+; SI: v_trunc_f32_e32
+; SI: v_trunc_f32_e32
+; SI: v_trunc_f32_e32
+; SI: v_trunc_f32_e32
+; SI: v_trunc_f32_e32
+; SI: v_trunc_f32_e32
+; SI: v_trunc_f32_e32
+define void @ftrunc_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %x) {
+ %y = call <8 x float> @llvm.trunc.v8f32(<8 x float> %x) nounwind readnone
+ store <8 x float> %y, <8 x float> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}ftrunc_v16f32:
+; EG: TRUNC
+; EG: TRUNC
+; EG: TRUNC
+; EG: TRUNC
+; EG: TRUNC
+; EG: TRUNC
+; EG: TRUNC
+; EG: TRUNC
+; EG: TRUNC
+; EG: TRUNC
+; EG: TRUNC
+; EG: TRUNC
+; EG: TRUNC
+; EG: TRUNC
+; EG: TRUNC
+; EG: TRUNC
+; SI: v_trunc_f32_e32
+; SI: v_trunc_f32_e32
+; SI: v_trunc_f32_e32
+; SI: v_trunc_f32_e32
+; SI: v_trunc_f32_e32
+; SI: v_trunc_f32_e32
+; SI: v_trunc_f32_e32
+; SI: v_trunc_f32_e32
+; SI: v_trunc_f32_e32
+; SI: v_trunc_f32_e32
+; SI: v_trunc_f32_e32
+; SI: v_trunc_f32_e32
+; SI: v_trunc_f32_e32
+; SI: v_trunc_f32_e32
+; SI: v_trunc_f32_e32
+; SI: v_trunc_f32_e32
+define void @ftrunc_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %x) {
+ %y = call <16 x float> @llvm.trunc.v16f32(<16 x float> %x) nounwind readnone
+ store <16 x float> %y, <16 x float> addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/gep-address-space.ll b/test/CodeGen/AMDGPU/gep-address-space.ll
new file mode 100644
index 000000000000..471b0f6b13e7
--- /dev/null
+++ b/test/CodeGen/AMDGPU/gep-address-space.ll
@@ -0,0 +1,55 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=CHECK %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck --check-prefix=CI --check-prefix=CHECK %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=CI --check-prefix=CHECK %s
+
+define void @use_gep_address_space([1024 x i32] addrspace(3)* %array) nounwind {
+; CHECK-LABEL: {{^}}use_gep_address_space:
+; CHECK: v_mov_b32_e32 [[PTR:v[0-9]+]], s{{[0-9]+}}
+; CHECK: ds_write_b32 [[PTR]], v{{[0-9]+}} offset:64
+ %p = getelementptr [1024 x i32], [1024 x i32] addrspace(3)* %array, i16 0, i16 16
+ store i32 99, i32 addrspace(3)* %p
+ ret void
+}
+
+define void @use_gep_address_space_large_offset([1024 x i32] addrspace(3)* %array) nounwind {
+; CHECK-LABEL: {{^}}use_gep_address_space_large_offset:
+; The LDS offset will be 65536 bytes (16384 i32 elements * 4 bytes), which is
+; larger than the size of LDS on SI, so it is OR'd with the base pointer.
+; SI: s_or_b32
+; CI: s_add_i32
+; CHECK: ds_write_b32
+ %p = getelementptr [1024 x i32], [1024 x i32] addrspace(3)* %array, i16 0, i16 16384
+ store i32 99, i32 addrspace(3)* %p
+ ret void
+}
+
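+; For a GEP over a vector of LDS pointers, the per-lane address computation is
+; expected to be scalarized into one s_add_i32 per element (see the v4 and v2
+; variants below).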
+define void @gep_as_vector_v4(<4 x [1024 x i32] addrspace(3)*> %array) nounwind {
+; CHECK-LABEL: {{^}}gep_as_vector_v4:
+; CHECK: s_add_i32
+; CHECK: s_add_i32
+; CHECK: s_add_i32
+; CHECK: s_add_i32
+ %p = getelementptr [1024 x i32], <4 x [1024 x i32] addrspace(3)*> %array, <4 x i16> zeroinitializer, <4 x i16> <i16 16, i16 16, i16 16, i16 16>
+ %p0 = extractelement <4 x i32 addrspace(3)*> %p, i32 0
+ %p1 = extractelement <4 x i32 addrspace(3)*> %p, i32 1
+ %p2 = extractelement <4 x i32 addrspace(3)*> %p, i32 2
+ %p3 = extractelement <4 x i32 addrspace(3)*> %p, i32 3
+ store i32 99, i32 addrspace(3)* %p0
+ store i32 99, i32 addrspace(3)* %p1
+ store i32 99, i32 addrspace(3)* %p2
+ store i32 99, i32 addrspace(3)* %p3
+ ret void
+}
+
+define void @gep_as_vector_v2(<2 x [1024 x i32] addrspace(3)*> %array) nounwind {
+; CHECK-LABEL: {{^}}gep_as_vector_v2:
+; CHECK: s_add_i32
+; CHECK: s_add_i32
+ %p = getelementptr [1024 x i32], <2 x [1024 x i32] addrspace(3)*> %array, <2 x i16> zeroinitializer, <2 x i16> <i16 16, i16 16>
+ %p0 = extractelement <2 x i32 addrspace(3)*> %p, i32 0
+ %p1 = extractelement <2 x i32 addrspace(3)*> %p, i32 1
+ store i32 99, i32 addrspace(3)* %p0
+ store i32 99, i32 addrspace(3)* %p1
+ ret void
+}
+
diff --git a/test/CodeGen/AMDGPU/global-directive.ll b/test/CodeGen/AMDGPU/global-directive.ll
new file mode 100644
index 000000000000..be775cf9292f
--- /dev/null
+++ b/test/CodeGen/AMDGPU/global-directive.ll
@@ -0,0 +1,15 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+; Make sure the GlobalDirective isn't merged with the function name
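+; The label check is anchored to the start of a line, so ".globl foo" and the
+; "foo:" label are expected to be emitted on separate lines.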
+
+; SI: .globl foo
+; SI: {{^}}foo:
+define void @foo(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+ %a = load i32, i32 addrspace(1)* %in
+ %b = load i32, i32 addrspace(1)* %b_ptr
+ %result = add i32 %a, %b
+ store i32 %result, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/global-extload-i1.ll b/test/CodeGen/AMDGPU/global-extload-i1.ll
new file mode 100644
index 000000000000..bd9557d730fb
--- /dev/null
+++ b/test/CodeGen/AMDGPU/global-extload-i1.ll
@@ -0,0 +1,302 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; XUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; FIXME: Evergreen broken
+
+; FUNC-LABEL: {{^}}zextload_global_i1_to_i32:
+; SI: buffer_load_ubyte
+; SI: buffer_store_dword
+; SI: s_endpgm
+define void @zextload_global_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+ %a = load i1, i1 addrspace(1)* %in
+ %ext = zext i1 %a to i32
+ store i32 %ext, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_i1_to_i32:
+; SI: buffer_load_ubyte
+; SI: v_bfe_i32 {{v[0-9]+}}, {{v[0-9]+}}, 0, 1{{$}}
+; SI: buffer_store_dword
+; SI: s_endpgm
+define void @sextload_global_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+ %a = load i1, i1 addrspace(1)* %in
+ %ext = sext i1 %a to i32
+ store i32 %ext, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_v1i1_to_v1i32:
+; SI: s_endpgm
+define void @zextload_global_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(1)* nocapture %in) nounwind {
+ %load = load <1 x i1>, <1 x i1> addrspace(1)* %in
+ %ext = zext <1 x i1> %load to <1 x i32>
+ store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_v1i1_to_v1i32:
+; SI: s_endpgm
+define void @sextload_global_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(1)* nocapture %in) nounwind {
+ %load = load <1 x i1>, <1 x i1> addrspace(1)* %in
+ %ext = sext <1 x i1> %load to <1 x i32>
+ store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_v2i1_to_v2i32:
+; SI: s_endpgm
+define void @zextload_global_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(1)* nocapture %in) nounwind {
+ %load = load <2 x i1>, <2 x i1> addrspace(1)* %in
+ %ext = zext <2 x i1> %load to <2 x i32>
+ store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_v2i1_to_v2i32:
+; SI: s_endpgm
+define void @sextload_global_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(1)* nocapture %in) nounwind {
+ %load = load <2 x i1>, <2 x i1> addrspace(1)* %in
+ %ext = sext <2 x i1> %load to <2 x i32>
+ store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_v4i1_to_v4i32:
+; SI: s_endpgm
+define void @zextload_global_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(1)* nocapture %in) nounwind {
+ %load = load <4 x i1>, <4 x i1> addrspace(1)* %in
+ %ext = zext <4 x i1> %load to <4 x i32>
+ store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_v4i1_to_v4i32:
+; SI: s_endpgm
+define void @sextload_global_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(1)* nocapture %in) nounwind {
+ %load = load <4 x i1>, <4 x i1> addrspace(1)* %in
+ %ext = sext <4 x i1> %load to <4 x i32>
+ store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_v8i1_to_v8i32:
+; SI: s_endpgm
+define void @zextload_global_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(1)* nocapture %in) nounwind {
+ %load = load <8 x i1>, <8 x i1> addrspace(1)* %in
+ %ext = zext <8 x i1> %load to <8 x i32>
+ store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_v8i1_to_v8i32:
+; SI: s_endpgm
+define void @sextload_global_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(1)* nocapture %in) nounwind {
+ %load = load <8 x i1>, <8 x i1> addrspace(1)* %in
+ %ext = sext <8 x i1> %load to <8 x i32>
+ store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_v16i1_to_v16i32:
+; SI: s_endpgm
+define void @zextload_global_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(1)* nocapture %in) nounwind {
+ %load = load <16 x i1>, <16 x i1> addrspace(1)* %in
+ %ext = zext <16 x i1> %load to <16 x i32>
+ store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_v16i1_to_v16i32:
+; SI: s_endpgm
+define void @sextload_global_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(1)* nocapture %in) nounwind {
+ %load = load <16 x i1>, <16 x i1> addrspace(1)* %in
+ %ext = sext <16 x i1> %load to <16 x i32>
+ store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
+ ret void
+}
+
+; XFUNC-LABEL: {{^}}zextload_global_v32i1_to_v32i32:
+; XSI: s_endpgm
+; define void @zextload_global_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(1)* nocapture %in) nounwind {
+; %load = load <32 x i1>, <32 x i1> addrspace(1)* %in
+; %ext = zext <32 x i1> %load to <32 x i32>
+; store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
+; ret void
+; }
+
+; XFUNC-LABEL: {{^}}sextload_global_v32i1_to_v32i32:
+; XSI: s_endpgm
+; define void @sextload_global_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(1)* nocapture %in) nounwind {
+; %load = load <32 x i1>, <32 x i1> addrspace(1)* %in
+; %ext = sext <32 x i1> %load to <32 x i32>
+; store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
+; ret void
+; }
+
+; XFUNC-LABEL: {{^}}zextload_global_v64i1_to_v64i32:
+; XSI: s_endpgm
+; define void @zextload_global_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(1)* nocapture %in) nounwind {
+; %load = load <64 x i1>, <64 x i1> addrspace(1)* %in
+; %ext = zext <64 x i1> %load to <64 x i32>
+; store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
+; ret void
+; }
+
+; XFUNC-LABEL: {{^}}sextload_global_v64i1_to_v64i32:
+; XSI: s_endpgm
+; define void @sextload_global_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(1)* nocapture %in) nounwind {
+; %load = load <64 x i1>, <64 x i1> addrspace(1)* %in
+; %ext = sext <64 x i1> %load to <64 x i32>
+; store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
+; ret void
+; }
+
+; FUNC-LABEL: {{^}}zextload_global_i1_to_i64:
+; SI: buffer_load_ubyte [[LOAD:v[0-9]+]],
+; SI: v_mov_b32_e32 {{v[0-9]+}}, 0{{$}}
+; SI: buffer_store_dwordx2
+define void @zextload_global_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+ %a = load i1, i1 addrspace(1)* %in
+ %ext = zext i1 %a to i64
+ store i64 %ext, i64 addrspace(1)* %out
+ ret void
+}
+
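+; Sign-extending the loaded i1 is expected to use a 1-bit v_bfe_i32 for the low
+; dword and an arithmetic shift right by 31 to produce the high dword.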
+; FUNC-LABEL: {{^}}sextload_global_i1_to_i64:
+; SI: buffer_load_ubyte [[LOAD:v[0-9]+]],
+; SI: v_bfe_i32 [[BFE:v[0-9]+]], {{v[0-9]+}}, 0, 1{{$}}
+; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[BFE]]
+; SI: buffer_store_dwordx2
+define void @sextload_global_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+ %a = load i1, i1 addrspace(1)* %in
+ %ext = sext i1 %a to i64
+ store i64 %ext, i64 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_v1i1_to_v1i64:
+; SI: s_endpgm
+define void @zextload_global_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(1)* nocapture %in) nounwind {
+ %load = load <1 x i1>, <1 x i1> addrspace(1)* %in
+ %ext = zext <1 x i1> %load to <1 x i64>
+ store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_v1i1_to_v1i64:
+; SI: s_endpgm
+define void @sextload_global_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(1)* nocapture %in) nounwind {
+ %load = load <1 x i1>, <1 x i1> addrspace(1)* %in
+ %ext = sext <1 x i1> %load to <1 x i64>
+ store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_v2i1_to_v2i64:
+; SI: s_endpgm
+define void @zextload_global_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(1)* nocapture %in) nounwind {
+ %load = load <2 x i1>, <2 x i1> addrspace(1)* %in
+ %ext = zext <2 x i1> %load to <2 x i64>
+ store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_v2i1_to_v2i64:
+; SI: s_endpgm
+define void @sextload_global_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(1)* nocapture %in) nounwind {
+ %load = load <2 x i1>, <2 x i1> addrspace(1)* %in
+ %ext = sext <2 x i1> %load to <2 x i64>
+ store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_v4i1_to_v4i64:
+; SI: s_endpgm
+define void @zextload_global_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(1)* nocapture %in) nounwind {
+ %load = load <4 x i1>, <4 x i1> addrspace(1)* %in
+ %ext = zext <4 x i1> %load to <4 x i64>
+ store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_v4i1_to_v4i64:
+; SI: s_endpgm
+define void @sextload_global_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(1)* nocapture %in) nounwind {
+ %load = load <4 x i1>, <4 x i1> addrspace(1)* %in
+ %ext = sext <4 x i1> %load to <4 x i64>
+ store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_v8i1_to_v8i64:
+; SI: s_endpgm
+define void @zextload_global_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(1)* nocapture %in) nounwind {
+ %load = load <8 x i1>, <8 x i1> addrspace(1)* %in
+ %ext = zext <8 x i1> %load to <8 x i64>
+ store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_v8i1_to_v8i64:
+; SI: s_endpgm
+define void @sextload_global_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(1)* nocapture %in) nounwind {
+ %load = load <8 x i1>, <8 x i1> addrspace(1)* %in
+ %ext = sext <8 x i1> %load to <8 x i64>
+ store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_v16i1_to_v16i64:
+; SI: s_endpgm
+define void @zextload_global_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(1)* nocapture %in) nounwind {
+ %load = load <16 x i1>, <16 x i1> addrspace(1)* %in
+ %ext = zext <16 x i1> %load to <16 x i64>
+ store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_v16i1_to_v16i64:
+; SI: s_endpgm
+define void @sextload_global_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(1)* nocapture %in) nounwind {
+ %load = load <16 x i1>, <16 x i1> addrspace(1)* %in
+ %ext = sext <16 x i1> %load to <16 x i64>
+ store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
+ ret void
+}
+
+; XFUNC-LABEL: {{^}}zextload_global_v32i1_to_v32i64:
+; XSI: s_endpgm
+; define void @zextload_global_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(1)* nocapture %in) nounwind {
+; %load = load <32 x i1>, <32 x i1> addrspace(1)* %in
+; %ext = zext <32 x i1> %load to <32 x i64>
+; store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
+; ret void
+; }
+
+; XFUNC-LABEL: {{^}}sextload_global_v32i1_to_v32i64:
+; XSI: s_endpgm
+; define void @sextload_global_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(1)* nocapture %in) nounwind {
+; %load = load <32 x i1>, <32 x i1> addrspace(1)* %in
+; %ext = sext <32 x i1> %load to <32 x i64>
+; store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
+; ret void
+; }
+
+; XFUNC-LABEL: {{^}}zextload_global_v64i1_to_v64i64:
+; XSI: s_endpgm
+; define void @zextload_global_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(1)* nocapture %in) nounwind {
+; %load = load <64 x i1>, <64 x i1> addrspace(1)* %in
+; %ext = zext <64 x i1> %load to <64 x i64>
+; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
+; ret void
+; }
+
+; XFUNC-LABEL: {{^}}sextload_global_v64i1_to_v64i64:
+; XSI: s_endpgm
+; define void @sextload_global_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(1)* nocapture %in) nounwind {
+; %load = load <64 x i1>, <64 x i1> addrspace(1)* %in
+; %ext = sext <64 x i1> %load to <64 x i64>
+; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
+; ret void
+; }
diff --git a/test/CodeGen/AMDGPU/global-extload-i16.ll b/test/CodeGen/AMDGPU/global-extload-i16.ll
new file mode 100644
index 000000000000..103a40dee270
--- /dev/null
+++ b/test/CodeGen/AMDGPU/global-extload-i16.ll
@@ -0,0 +1,302 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; XUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; FIXME: cypress is broken because the larger test cases spill and spilling is not implemented
+
+; FUNC-LABEL: {{^}}zextload_global_i16_to_i32:
+; SI: buffer_load_ushort
+; SI: buffer_store_dword
+; SI: s_endpgm
+define void @zextload_global_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
+ %a = load i16, i16 addrspace(1)* %in
+ %ext = zext i16 %a to i32
+ store i32 %ext, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_i16_to_i32:
+; SI: buffer_load_sshort
+; SI: buffer_store_dword
+; SI: s_endpgm
+define void @sextload_global_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
+ %a = load i16, i16 addrspace(1)* %in
+ %ext = sext i16 %a to i32
+ store i32 %ext, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_v1i16_to_v1i32:
+; SI: buffer_load_ushort
+; SI: s_endpgm
+define void @zextload_global_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind {
+ %load = load <1 x i16>, <1 x i16> addrspace(1)* %in
+ %ext = zext <1 x i16> %load to <1 x i32>
+ store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_v1i16_to_v1i32:
+; SI: buffer_load_sshort
+; SI: s_endpgm
+define void @sextload_global_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind {
+ %load = load <1 x i16>, <1 x i16> addrspace(1)* %in
+ %ext = sext <1 x i16> %load to <1 x i32>
+ store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_v2i16_to_v2i32:
+; SI: s_endpgm
+define void @zextload_global_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind {
+ %load = load <2 x i16>, <2 x i16> addrspace(1)* %in
+ %ext = zext <2 x i16> %load to <2 x i32>
+ store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_v2i16_to_v2i32:
+; SI: s_endpgm
+define void @sextload_global_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind {
+ %load = load <2 x i16>, <2 x i16> addrspace(1)* %in
+ %ext = sext <2 x i16> %load to <2 x i32>
+ store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_v4i16_to_v4i32:
+; SI: s_endpgm
+define void @zextload_global_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind {
+ %load = load <4 x i16>, <4 x i16> addrspace(1)* %in
+ %ext = zext <4 x i16> %load to <4 x i32>
+ store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_v4i16_to_v4i32:
+; SI: s_endpgm
+define void @sextload_global_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind {
+ %load = load <4 x i16>, <4 x i16> addrspace(1)* %in
+ %ext = sext <4 x i16> %load to <4 x i32>
+ store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_v8i16_to_v8i32:
+; SI: s_endpgm
+define void @zextload_global_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind {
+ %load = load <8 x i16>, <8 x i16> addrspace(1)* %in
+ %ext = zext <8 x i16> %load to <8 x i32>
+ store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_v8i16_to_v8i32:
+; SI: s_endpgm
+define void @sextload_global_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind {
+ %load = load <8 x i16>, <8 x i16> addrspace(1)* %in
+ %ext = sext <8 x i16> %load to <8 x i32>
+ store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_v16i16_to_v16i32:
+; SI: s_endpgm
+define void @zextload_global_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind {
+ %load = load <16 x i16>, <16 x i16> addrspace(1)* %in
+ %ext = zext <16 x i16> %load to <16 x i32>
+ store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_v16i16_to_v16i32:
+; SI: s_endpgm
+define void @sextload_global_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind {
+ %load = load <16 x i16>, <16 x i16> addrspace(1)* %in
+ %ext = sext <16 x i16> %load to <16 x i32>
+ store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_v32i16_to_v32i32:
+; SI: s_endpgm
+define void @zextload_global_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind {
+ %load = load <32 x i16>, <32 x i16> addrspace(1)* %in
+ %ext = zext <32 x i16> %load to <32 x i32>
+ store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_v32i16_to_v32i32:
+; SI: s_endpgm
+define void @sextload_global_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind {
+ %load = load <32 x i16>, <32 x i16> addrspace(1)* %in
+ %ext = sext <32 x i16> %load to <32 x i32>
+ store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_v64i16_to_v64i32:
+; SI: s_endpgm
+define void @zextload_global_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind {
+ %load = load <64 x i16>, <64 x i16> addrspace(1)* %in
+ %ext = zext <64 x i16> %load to <64 x i32>
+ store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_v64i16_to_v64i32:
+; SI: s_endpgm
+define void @sextload_global_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind {
+ %load = load <64 x i16>, <64 x i16> addrspace(1)* %in
+ %ext = sext <64 x i16> %load to <64 x i32>
+ store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_i16_to_i64:
+; SI: buffer_load_ushort v[[LO:[0-9]+]],
+; SI: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
+; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]]
+define void @zextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
+ %a = load i16, i16 addrspace(1)* %in
+ %ext = zext i16 %a to i64
+ store i64 %ext, i64 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_i16_to_i64:
+; SI: buffer_load_sshort [[LOAD:v[0-9]+]],
+; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[LOAD]]
+; SI: buffer_store_dwordx2
+define void @sextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
+ %a = load i16, i16 addrspace(1)* %in
+ %ext = sext i16 %a to i64
+ store i64 %ext, i64 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_v1i16_to_v1i64:
+; SI: s_endpgm
+define void @zextload_global_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind {
+ %load = load <1 x i16>, <1 x i16> addrspace(1)* %in
+ %ext = zext <1 x i16> %load to <1 x i64>
+ store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_v1i16_to_v1i64:
+; SI: s_endpgm
+define void @sextload_global_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind {
+ %load = load <1 x i16>, <1 x i16> addrspace(1)* %in
+ %ext = sext <1 x i16> %load to <1 x i64>
+ store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_v2i16_to_v2i64:
+; SI: s_endpgm
+define void @zextload_global_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind {
+ %load = load <2 x i16>, <2 x i16> addrspace(1)* %in
+ %ext = zext <2 x i16> %load to <2 x i64>
+ store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_v2i16_to_v2i64:
+; SI: s_endpgm
+define void @sextload_global_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind {
+ %load = load <2 x i16>, <2 x i16> addrspace(1)* %in
+ %ext = sext <2 x i16> %load to <2 x i64>
+ store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_v4i16_to_v4i64:
+; SI: s_endpgm
+define void @zextload_global_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind {
+ %load = load <4 x i16>, <4 x i16> addrspace(1)* %in
+ %ext = zext <4 x i16> %load to <4 x i64>
+ store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_v4i16_to_v4i64:
+; SI: s_endpgm
+define void @sextload_global_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind {
+ %load = load <4 x i16>, <4 x i16> addrspace(1)* %in
+ %ext = sext <4 x i16> %load to <4 x i64>
+ store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_v8i16_to_v8i64:
+; SI: s_endpgm
+define void @zextload_global_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind {
+ %load = load <8 x i16>, <8 x i16> addrspace(1)* %in
+ %ext = zext <8 x i16> %load to <8 x i64>
+ store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_v8i16_to_v8i64:
+; SI: s_endpgm
+define void @sextload_global_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind {
+ %load = load <8 x i16>, <8 x i16> addrspace(1)* %in
+ %ext = sext <8 x i16> %load to <8 x i64>
+ store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_v16i16_to_v16i64:
+; SI: s_endpgm
+define void @zextload_global_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind {
+ %load = load <16 x i16>, <16 x i16> addrspace(1)* %in
+ %ext = zext <16 x i16> %load to <16 x i64>
+ store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_v16i16_to_v16i64:
+; SI: s_endpgm
+define void @sextload_global_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind {
+ %load = load <16 x i16>, <16 x i16> addrspace(1)* %in
+ %ext = sext <16 x i16> %load to <16 x i64>
+ store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_v32i16_to_v32i64:
+; SI: s_endpgm
+define void @zextload_global_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind {
+ %load = load <32 x i16>, <32 x i16> addrspace(1)* %in
+ %ext = zext <32 x i16> %load to <32 x i64>
+ store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_v32i16_to_v32i64:
+; SI: s_endpgm
+define void @sextload_global_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind {
+ %load = load <32 x i16>, <32 x i16> addrspace(1)* %in
+ %ext = sext <32 x i16> %load to <32 x i64>
+ store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_v64i16_to_v64i64:
+; SI: s_endpgm
+define void @zextload_global_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind {
+ %load = load <64 x i16>, <64 x i16> addrspace(1)* %in
+ %ext = zext <64 x i16> %load to <64 x i64>
+ store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_v64i16_to_v64i64:
+; SI: s_endpgm
+define void @sextload_global_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind {
+ %load = load <64 x i16>, <64 x i16> addrspace(1)* %in
+ %ext = sext <64 x i16> %load to <64 x i64>
+ store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/global-extload-i32.ll b/test/CodeGen/AMDGPU/global-extload-i32.ll
new file mode 100644
index 000000000000..79b83452939e
--- /dev/null
+++ b/test/CodeGen/AMDGPU/global-extload-i32.ll
@@ -0,0 +1,457 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
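+; Zero-extending i32 to i64 should only need a v_mov_b32 of 0 for the high
+; dword alongside the loaded low dword.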
+; FUNC-LABEL: {{^}}zextload_global_i32_to_i64:
+; SI: buffer_load_dword v[[LO:[0-9]+]],
+; SI: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
+; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]]
+define void @zextload_global_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %a = load i32, i32 addrspace(1)* %in
+ %ext = zext i32 %a to i64
+ store i64 %ext, i64 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_i32_to_i64:
+; SI: buffer_load_dword [[LOAD:v[0-9]+]],
+; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[LOAD]]
+; SI: buffer_store_dwordx2
+define void @sextload_global_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %a = load i32, i32 addrspace(1)* %in
+ %ext = sext i32 %a to i64
+ store i64 %ext, i64 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_v1i32_to_v1i64:
+; SI: buffer_load_dword
+; SI: buffer_store_dwordx2
+; SI: s_endpgm
+define void @zextload_global_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(1)* nocapture %in) nounwind {
+ %load = load <1 x i32>, <1 x i32> addrspace(1)* %in
+ %ext = zext <1 x i32> %load to <1 x i64>
+ store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_v1i32_to_v1i64:
+; SI: buffer_load_dword
+; SI: v_ashrrev_i32
+; SI: buffer_store_dwordx2
+; SI: s_endpgm
+define void @sextload_global_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(1)* nocapture %in) nounwind {
+ %load = load <1 x i32>, <1 x i32> addrspace(1)* %in
+ %ext = sext <1 x i32> %load to <1 x i64>
+ store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_v2i32_to_v2i64:
+; SI: buffer_load_dwordx2
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: s_endpgm
+define void @zextload_global_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(1)* nocapture %in) nounwind {
+ %load = load <2 x i32>, <2 x i32> addrspace(1)* %in
+ %ext = zext <2 x i32> %load to <2 x i64>
+ store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_v2i32_to_v2i64:
+; SI: buffer_load_dwordx2
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI: s_endpgm
+define void @sextload_global_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(1)* nocapture %in) nounwind {
+ %load = load <2 x i32>, <2 x i32> addrspace(1)* %in
+ %ext = sext <2 x i32> %load to <2 x i64>
+ store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_v4i32_to_v4i64:
+; SI: buffer_load_dwordx4
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: s_endpgm
+define void @zextload_global_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(1)* nocapture %in) nounwind {
+ %load = load <4 x i32>, <4 x i32> addrspace(1)* %in
+ %ext = zext <4 x i32> %load to <4 x i64>
+ store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_v4i32_to_v4i64:
+; SI: buffer_load_dwordx4
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI: s_endpgm
+define void @sextload_global_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(1)* nocapture %in) nounwind {
+ %load = load <4 x i32>, <4 x i32> addrspace(1)* %in
+ %ext = sext <4 x i32> %load to <4 x i64>
+ store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_v8i32_to_v8i64:
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI: s_endpgm
+define void @zextload_global_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(1)* nocapture %in) nounwind {
+ %load = load <8 x i32>, <8 x i32> addrspace(1)* %in
+ %ext = zext <8 x i32> %load to <8 x i64>
+ store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_v8i32_to_v8i64:
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+
+; SI: s_endpgm
+define void @sextload_global_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(1)* nocapture %in) nounwind {
+ %load = load <8 x i32>, <8 x i32> addrspace(1)* %in
+ %ext = sext <8 x i32> %load to <8 x i64>
+ store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_v16i32_to_v16i64:
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI: s_endpgm
+define void @sextload_global_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(1)* nocapture %in) nounwind {
+ %load = load <16 x i32>, <16 x i32> addrspace(1)* %in
+ %ext = sext <16 x i32> %load to <16 x i64>
+ store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_v16i32_to_v16i64:
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+
+; SI: s_endpgm
+define void @zextload_global_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(1)* nocapture %in) nounwind {
+ %load = load <16 x i32>, <16 x i32> addrspace(1)* %in
+ %ext = zext <16 x i32> %load to <16 x i64>
+ store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_v32i32_to_v32i64:
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+; SI-DAG: v_ashrrev_i32
+
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+
+; SI: s_endpgm
+define void @sextload_global_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(1)* nocapture %in) nounwind {
+ %load = load <32 x i32>, <32 x i32> addrspace(1)* %in
+ %ext = sext <32 x i32> %load to <32 x i64>
+ store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_v32i32_to_v32i64:
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dwordx2
+
+; SI: s_endpgm
+define void @zextload_global_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(1)* nocapture %in) nounwind {
+ %load = load <32 x i32>, <32 x i32> addrspace(1)* %in
+ %ext = zext <32 x i32> %load to <32 x i64>
+ store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/global-extload-i8.ll b/test/CodeGen/AMDGPU/global-extload-i8.ll
new file mode 100644
index 000000000000..b31d5361d5a2
--- /dev/null
+++ b/test/CodeGen/AMDGPU/global-extload-i8.ll
@@ -0,0 +1,299 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}zextload_global_i8_to_i32:
+; SI: buffer_load_ubyte
+; SI: buffer_store_dword
+; SI: s_endpgm
+define void @zextload_global_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind {
+ %a = load i8, i8 addrspace(1)* %in
+ %ext = zext i8 %a to i32
+ store i32 %ext, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_i8_to_i32:
+; SI: buffer_load_sbyte
+; SI: buffer_store_dword
+; SI: s_endpgm
+define void @sextload_global_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind {
+ %a = load i8, i8 addrspace(1)* %in
+ %ext = sext i8 %a to i32
+ store i32 %ext, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_v1i8_to_v1i32:
+; SI: s_endpgm
+define void @zextload_global_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(1)* nocapture %in) nounwind {
+ %load = load <1 x i8>, <1 x i8> addrspace(1)* %in
+ %ext = zext <1 x i8> %load to <1 x i32>
+ store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_v1i8_to_v1i32:
+; SI: s_endpgm
+define void @sextload_global_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(1)* nocapture %in) nounwind {
+ %load = load <1 x i8>, <1 x i8> addrspace(1)* %in
+ %ext = sext <1 x i8> %load to <1 x i32>
+ store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_v2i8_to_v2i32:
+; SI: s_endpgm
+define void @zextload_global_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* nocapture %in) nounwind {
+ %load = load <2 x i8>, <2 x i8> addrspace(1)* %in
+ %ext = zext <2 x i8> %load to <2 x i32>
+ store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_v2i8_to_v2i32:
+; SI: s_endpgm
+define void @sextload_global_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* nocapture %in) nounwind {
+ %load = load <2 x i8>, <2 x i8> addrspace(1)* %in
+ %ext = sext <2 x i8> %load to <2 x i32>
+ store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_v4i8_to_v4i32:
+; SI: s_endpgm
+define void @zextload_global_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* nocapture %in) nounwind {
+ %load = load <4 x i8>, <4 x i8> addrspace(1)* %in
+ %ext = zext <4 x i8> %load to <4 x i32>
+ store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_v4i8_to_v4i32:
+; SI: s_endpgm
+define void @sextload_global_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* nocapture %in) nounwind {
+ %load = load <4 x i8>, <4 x i8> addrspace(1)* %in
+ %ext = sext <4 x i8> %load to <4 x i32>
+ store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_v8i8_to_v8i32:
+; SI: s_endpgm
+define void @zextload_global_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(1)* nocapture %in) nounwind {
+ %load = load <8 x i8>, <8 x i8> addrspace(1)* %in
+ %ext = zext <8 x i8> %load to <8 x i32>
+ store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_v8i8_to_v8i32:
+; SI: s_endpgm
+define void @sextload_global_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(1)* nocapture %in) nounwind {
+ %load = load <8 x i8>, <8 x i8> addrspace(1)* %in
+ %ext = sext <8 x i8> %load to <8 x i32>
+ store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_v16i8_to_v16i32:
+; SI: s_endpgm
+define void @zextload_global_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(1)* nocapture %in) nounwind {
+ %load = load <16 x i8>, <16 x i8> addrspace(1)* %in
+ %ext = zext <16 x i8> %load to <16 x i32>
+ store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_v16i8_to_v16i32:
+; SI: s_endpgm
+define void @sextload_global_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(1)* nocapture %in) nounwind {
+ %load = load <16 x i8>, <16 x i8> addrspace(1)* %in
+ %ext = sext <16 x i8> %load to <16 x i32>
+ store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
+ ret void
+}
+
+; XFUNC-LABEL: {{^}}zextload_global_v32i8_to_v32i32:
+; XSI: s_endpgm
+; define void @zextload_global_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(1)* nocapture %in) nounwind {
+; %load = load <32 x i8>, <32 x i8> addrspace(1)* %in
+; %ext = zext <32 x i8> %load to <32 x i32>
+; store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
+; ret void
+; }
+
+; XFUNC-LABEL: {{^}}sextload_global_v32i8_to_v32i32:
+; XSI: s_endpgm
+; define void @sextload_global_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(1)* nocapture %in) nounwind {
+; %load = load <32 x i8>, <32 x i8> addrspace(1)* %in
+; %ext = sext <32 x i8> %load to <32 x i32>
+; store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
+; ret void
+; }
+
+; XFUNC-LABEL: {{^}}zextload_global_v64i8_to_v64i32:
+; XSI: s_endpgm
+; define void @zextload_global_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(1)* nocapture %in) nounwind {
+; %load = load <64 x i8>, <64 x i8> addrspace(1)* %in
+; %ext = zext <64 x i8> %load to <64 x i32>
+; store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
+; ret void
+; }
+
+; XFUNC-LABEL: {{^}}sextload_global_v64i8_to_v64i32:
+; XSI: s_endpgm
+; define void @sextload_global_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(1)* nocapture %in) nounwind {
+; %load = load <64 x i8>, <64 x i8> addrspace(1)* %in
+; %ext = sext <64 x i8> %load to <64 x i32>
+; store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
+; ret void
+; }
+
+; FUNC-LABEL: {{^}}zextload_global_i8_to_i64:
+; SI: buffer_load_ubyte v[[LO:[0-9]+]],
+; SI: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
+; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]]
+define void @zextload_global_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind {
+ %a = load i8, i8 addrspace(1)* %in
+ %ext = zext i8 %a to i64
+ store i64 %ext, i64 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_i8_to_i64:
+; SI: buffer_load_sbyte [[LOAD:v[0-9]+]],
+; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[LOAD]]
+; SI: buffer_store_dwordx2
+define void @sextload_global_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind {
+ %a = load i8, i8 addrspace(1)* %in
+ %ext = sext i8 %a to i64
+ store i64 %ext, i64 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_v1i8_to_v1i64:
+; SI: s_endpgm
+define void @zextload_global_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(1)* nocapture %in) nounwind {
+ %load = load <1 x i8>, <1 x i8> addrspace(1)* %in
+ %ext = zext <1 x i8> %load to <1 x i64>
+ store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_v1i8_to_v1i64:
+; SI: s_endpgm
+define void @sextload_global_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(1)* nocapture %in) nounwind {
+ %load = load <1 x i8>, <1 x i8> addrspace(1)* %in
+ %ext = sext <1 x i8> %load to <1 x i64>
+ store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_v2i8_to_v2i64:
+; SI: s_endpgm
+define void @zextload_global_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(1)* nocapture %in) nounwind {
+ %load = load <2 x i8>, <2 x i8> addrspace(1)* %in
+ %ext = zext <2 x i8> %load to <2 x i64>
+ store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_v2i8_to_v2i64:
+; SI: s_endpgm
+define void @sextload_global_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(1)* nocapture %in) nounwind {
+ %load = load <2 x i8>, <2 x i8> addrspace(1)* %in
+ %ext = sext <2 x i8> %load to <2 x i64>
+ store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_v4i8_to_v4i64:
+; SI: s_endpgm
+define void @zextload_global_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(1)* nocapture %in) nounwind {
+ %load = load <4 x i8>, <4 x i8> addrspace(1)* %in
+ %ext = zext <4 x i8> %load to <4 x i64>
+ store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_v4i8_to_v4i64:
+; SI: s_endpgm
+define void @sextload_global_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(1)* nocapture %in) nounwind {
+ %load = load <4 x i8>, <4 x i8> addrspace(1)* %in
+ %ext = sext <4 x i8> %load to <4 x i64>
+ store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_v8i8_to_v8i64:
+; SI: s_endpgm
+define void @zextload_global_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(1)* nocapture %in) nounwind {
+ %load = load <8 x i8>, <8 x i8> addrspace(1)* %in
+ %ext = zext <8 x i8> %load to <8 x i64>
+ store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_v8i8_to_v8i64:
+; SI: s_endpgm
+define void @sextload_global_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(1)* nocapture %in) nounwind {
+ %load = load <8 x i8>, <8 x i8> addrspace(1)* %in
+ %ext = sext <8 x i8> %load to <8 x i64>
+ store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_global_v16i8_to_v16i64:
+; SI: s_endpgm
+define void @zextload_global_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(1)* nocapture %in) nounwind {
+ %load = load <16 x i8>, <16 x i8> addrspace(1)* %in
+ %ext = zext <16 x i8> %load to <16 x i64>
+ store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_global_v16i8_to_v16i64:
+; SI: s_endpgm
+define void @sextload_global_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(1)* nocapture %in) nounwind {
+ %load = load <16 x i8>, <16 x i8> addrspace(1)* %in
+ %ext = sext <16 x i8> %load to <16 x i64>
+ store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
+ ret void
+}
+
+; XFUNC-LABEL: {{^}}zextload_global_v32i8_to_v32i64:
+; XSI: s_endpgm
+; define void @zextload_global_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(1)* nocapture %in) nounwind {
+; %load = load <32 x i8>, <32 x i8> addrspace(1)* %in
+; %ext = zext <32 x i8> %load to <32 x i64>
+; store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
+; ret void
+; }
+
+; XFUNC-LABEL: {{^}}sextload_global_v32i8_to_v32i64:
+; XSI: s_endpgm
+; define void @sextload_global_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(1)* nocapture %in) nounwind {
+; %load = load <32 x i8>, <32 x i8> addrspace(1)* %in
+; %ext = sext <32 x i8> %load to <32 x i64>
+; store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
+; ret void
+; }
+
+; XFUNC-LABEL: {{^}}zextload_global_v64i8_to_v64i64:
+; XSI: s_endpgm
+; define void @zextload_global_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(1)* nocapture %in) nounwind {
+; %load = load <64 x i8>, <64 x i8> addrspace(1)* %in
+; %ext = zext <64 x i8> %load to <64 x i64>
+; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
+; ret void
+; }
+
+; XFUNC-LABEL: {{^}}sextload_global_v64i8_to_v64i64:
+; XSI: s_endpgm
+; define void @sextload_global_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(1)* nocapture %in) nounwind {
+; %load = load <64 x i8>, <64 x i8> addrspace(1)* %in
+; %ext = sext <64 x i8> %load to <64 x i64>
+; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
+; ret void
+; }
diff --git a/test/CodeGen/AMDGPU/global-zero-initializer.ll b/test/CodeGen/AMDGPU/global-zero-initializer.ll
new file mode 100644
index 000000000000..45aa8bf4e1d7
--- /dev/null
+++ b/test/CodeGen/AMDGPU/global-zero-initializer.ll
@@ -0,0 +1,13 @@
+; RUN: not llc -march=amdgcn -mcpu=SI < %s 2>&1 | FileCheck %s
+; RUN: not llc -march=amdgcn -mcpu=tonga < %s 2>&1 | FileCheck %s
+
+; CHECK: error: unsupported initializer for address space in load_init_global_global
+
+@lds = addrspace(1) global [256 x i32] zeroinitializer
+
+define void @load_init_global_global(i32 addrspace(1)* %out, i1 %p) {
+ %gep = getelementptr [256 x i32], [256 x i32] addrspace(1)* @lds, i32 0, i32 10
+ %ld = load i32, i32 addrspace(1)* %gep
+ store i32 %ld, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/global_atomics.ll b/test/CodeGen/AMDGPU/global_atomics.ll
new file mode 100644
index 000000000000..847950f6376e
--- /dev/null
+++ b/test/CodeGen/AMDGPU/global_atomics.ll
@@ -0,0 +1,801 @@
+; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}atomic_add_i32_offset:
+; SI: buffer_atomic_add v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
+define void @atomic_add_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
+ %0 = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_add_i32_ret_offset:
+; SI: buffer_atomic_add [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_add_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
+ %0 = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst
+ store i32 %0, i32 addrspace(1)* %out2
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_add_i32_addr64_offset:
+; SI: buffer_atomic_add v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
+define void @atomic_add_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
+ %0 = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_add_i32_ret_addr64_offset:
+; SI: buffer_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_add_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
+ %0 = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst
+ store i32 %0, i32 addrspace(1)* %out2
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_add_i32:
+; SI: buffer_atomic_add v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
+define void @atomic_add_i32(i32 addrspace(1)* %out, i32 %in) {
+entry:
+ %0 = atomicrmw volatile add i32 addrspace(1)* %out, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_add_i32_ret:
+; SI: buffer_atomic_add [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; SI: buffer_store_dword [[RET]]
+define void @atomic_add_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+entry:
+ %0 = atomicrmw volatile add i32 addrspace(1)* %out, i32 %in seq_cst
+ store i32 %0, i32 addrspace(1)* %out2
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_add_i32_addr64:
+; SI: buffer_atomic_add v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
+define void @atomic_add_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+ %0 = atomicrmw volatile add i32 addrspace(1)* %ptr, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_add_i32_ret_addr64:
+; SI: buffer_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_add_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+ %0 = atomicrmw volatile add i32 addrspace(1)* %ptr, i32 %in seq_cst
+ store i32 %0, i32 addrspace(1)* %out2
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_and_i32_offset:
+; SI: buffer_atomic_and v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
+define void @atomic_and_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
+ %0 = atomicrmw volatile and i32 addrspace(1)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_and_i32_ret_offset:
+; SI: buffer_atomic_and [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_and_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
+ %0 = atomicrmw volatile and i32 addrspace(1)* %gep, i32 %in seq_cst
+ store i32 %0, i32 addrspace(1)* %out2
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_and_i32_addr64_offset:
+; SI: buffer_atomic_and v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
+define void @atomic_and_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
+ %0 = atomicrmw volatile and i32 addrspace(1)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_and_i32_ret_addr64_offset:
+; SI: buffer_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_and_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
+ %0 = atomicrmw volatile and i32 addrspace(1)* %gep, i32 %in seq_cst
+ store i32 %0, i32 addrspace(1)* %out2
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_and_i32:
+; SI: buffer_atomic_and v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
+define void @atomic_and_i32(i32 addrspace(1)* %out, i32 %in) {
+entry:
+ %0 = atomicrmw volatile and i32 addrspace(1)* %out, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_and_i32_ret:
+; SI: buffer_atomic_and [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; SI: buffer_store_dword [[RET]]
+define void @atomic_and_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+entry:
+ %0 = atomicrmw volatile and i32 addrspace(1)* %out, i32 %in seq_cst
+ store i32 %0, i32 addrspace(1)* %out2
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_and_i32_addr64:
+; SI: buffer_atomic_and v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
+define void @atomic_and_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+ %0 = atomicrmw volatile and i32 addrspace(1)* %ptr, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_and_i32_ret_addr64:
+; SI: buffer_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_and_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+ %0 = atomicrmw volatile and i32 addrspace(1)* %ptr, i32 %in seq_cst
+ store i32 %0, i32 addrspace(1)* %out2
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_sub_i32_offset:
+; SI: buffer_atomic_sub v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
+define void @atomic_sub_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
+ %0 = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_sub_i32_ret_offset:
+; SI: buffer_atomic_sub [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_sub_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
+ %0 = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 %in seq_cst
+ store i32 %0, i32 addrspace(1)* %out2
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_sub_i32_addr64_offset:
+; SI: buffer_atomic_sub v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
+define void @atomic_sub_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
+ %0 = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_sub_i32_ret_addr64_offset:
+; SI: buffer_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_sub_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
+ %0 = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 %in seq_cst
+ store i32 %0, i32 addrspace(1)* %out2
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_sub_i32:
+; SI: buffer_atomic_sub v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
+define void @atomic_sub_i32(i32 addrspace(1)* %out, i32 %in) {
+entry:
+ %0 = atomicrmw volatile sub i32 addrspace(1)* %out, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_sub_i32_ret:
+; SI: buffer_atomic_sub [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; SI: buffer_store_dword [[RET]]
+define void @atomic_sub_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+entry:
+ %0 = atomicrmw volatile sub i32 addrspace(1)* %out, i32 %in seq_cst
+ store i32 %0, i32 addrspace(1)* %out2
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_sub_i32_addr64:
+; SI: buffer_atomic_sub v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
+define void @atomic_sub_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+ %0 = atomicrmw volatile sub i32 addrspace(1)* %ptr, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_sub_i32_ret_addr64:
+; SI: buffer_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_sub_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+ %0 = atomicrmw volatile sub i32 addrspace(1)* %ptr, i32 %in seq_cst
+ store i32 %0, i32 addrspace(1)* %out2
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_max_i32_offset:
+; SI: buffer_atomic_smax v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
+define void @atomic_max_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
+ %0 = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_max_i32_ret_offset:
+; SI: buffer_atomic_smax [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_max_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
+ %0 = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in seq_cst
+ store i32 %0, i32 addrspace(1)* %out2
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_max_i32_addr64_offset:
+; SI: buffer_atomic_smax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
+define void @atomic_max_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
+ %0 = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_max_i32_ret_addr64_offset:
+; SI: buffer_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_max_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
+ %0 = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in seq_cst
+ store i32 %0, i32 addrspace(1)* %out2
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_max_i32:
+; SI: buffer_atomic_smax v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
+define void @atomic_max_i32(i32 addrspace(1)* %out, i32 %in) {
+entry:
+ %0 = atomicrmw volatile max i32 addrspace(1)* %out, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_max_i32_ret:
+; SI: buffer_atomic_smax [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; SI: buffer_store_dword [[RET]]
+define void @atomic_max_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+entry:
+ %0 = atomicrmw volatile max i32 addrspace(1)* %out, i32 %in seq_cst
+ store i32 %0, i32 addrspace(1)* %out2
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_max_i32_addr64:
+; SI: buffer_atomic_smax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
+define void @atomic_max_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+ %0 = atomicrmw volatile max i32 addrspace(1)* %ptr, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_max_i32_ret_addr64:
+; SI: buffer_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_max_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+ %0 = atomicrmw volatile max i32 addrspace(1)* %ptr, i32 %in seq_cst
+ store i32 %0, i32 addrspace(1)* %out2
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_umax_i32_offset:
+; SI: buffer_atomic_umax v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
+define void @atomic_umax_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
+ %0 = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_umax_i32_ret_offset:
+; SI: buffer_atomic_umax [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_umax_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
+ %0 = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in seq_cst
+ store i32 %0, i32 addrspace(1)* %out2
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_umax_i32_addr64_offset:
+; SI: buffer_atomic_umax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
+define void @atomic_umax_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
+ %0 = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_umax_i32_ret_addr64_offset:
+; SI: buffer_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_umax_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
+ %0 = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in seq_cst
+ store i32 %0, i32 addrspace(1)* %out2
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_umax_i32:
+; SI: buffer_atomic_umax v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
+define void @atomic_umax_i32(i32 addrspace(1)* %out, i32 %in) {
+entry:
+ %0 = atomicrmw volatile umax i32 addrspace(1)* %out, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_umax_i32_ret:
+; SI: buffer_atomic_umax [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; SI: buffer_store_dword [[RET]]
+define void @atomic_umax_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+entry:
+ %0 = atomicrmw volatile umax i32 addrspace(1)* %out, i32 %in seq_cst
+ store i32 %0, i32 addrspace(1)* %out2
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_umax_i32_addr64:
+; SI: buffer_atomic_umax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
+define void @atomic_umax_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+ %0 = atomicrmw volatile umax i32 addrspace(1)* %ptr, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_umax_i32_ret_addr64:
+; SI: buffer_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_umax_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+ %0 = atomicrmw volatile umax i32 addrspace(1)* %ptr, i32 %in seq_cst
+ store i32 %0, i32 addrspace(1)* %out2
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_min_i32_offset:
+; SI: buffer_atomic_smin v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
+define void @atomic_min_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
+ %0 = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_min_i32_ret_offset:
+; SI: buffer_atomic_smin [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_min_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
+ %0 = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in seq_cst
+ store i32 %0, i32 addrspace(1)* %out2
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_min_i32_addr64_offset:
+; SI: buffer_atomic_smin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
+define void @atomic_min_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
+ %0 = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_min_i32_ret_addr64_offset:
+; SI: buffer_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_min_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
+ %0 = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in seq_cst
+ store i32 %0, i32 addrspace(1)* %out2
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_min_i32:
+; SI: buffer_atomic_smin v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
+define void @atomic_min_i32(i32 addrspace(1)* %out, i32 %in) {
+entry:
+ %0 = atomicrmw volatile min i32 addrspace(1)* %out, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_min_i32_ret:
+; SI: buffer_atomic_smin [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; SI: buffer_store_dword [[RET]]
+define void @atomic_min_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+entry:
+ %0 = atomicrmw volatile min i32 addrspace(1)* %out, i32 %in seq_cst
+ store i32 %0, i32 addrspace(1)* %out2
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_min_i32_addr64:
+; SI: buffer_atomic_smin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
+define void @atomic_min_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+ %0 = atomicrmw volatile min i32 addrspace(1)* %ptr, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_min_i32_ret_addr64:
+; SI: buffer_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_min_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+ %0 = atomicrmw volatile min i32 addrspace(1)* %ptr, i32 %in seq_cst
+ store i32 %0, i32 addrspace(1)* %out2
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_umin_i32_offset:
+; SI: buffer_atomic_umin v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
+define void @atomic_umin_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
+ %0 = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_umin_i32_ret_offset:
+; SI: buffer_atomic_umin [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_umin_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
+ %0 = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in seq_cst
+ store i32 %0, i32 addrspace(1)* %out2
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_umin_i32_addr64_offset:
+; SI: buffer_atomic_umin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
+define void @atomic_umin_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
+ %0 = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_umin_i32_ret_addr64_offset:
+; SI: buffer_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_umin_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
+ %0 = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in seq_cst
+ store i32 %0, i32 addrspace(1)* %out2
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_umin_i32:
+; SI: buffer_atomic_umin v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
+define void @atomic_umin_i32(i32 addrspace(1)* %out, i32 %in) {
+entry:
+ %0 = atomicrmw volatile umin i32 addrspace(1)* %out, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_umin_i32_ret:
+; SI: buffer_atomic_umin [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; SI: buffer_store_dword [[RET]]
+define void @atomic_umin_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+entry:
+ %0 = atomicrmw volatile umin i32 addrspace(1)* %out, i32 %in seq_cst
+ store i32 %0, i32 addrspace(1)* %out2
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_umin_i32_addr64:
+; SI: buffer_atomic_umin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
+define void @atomic_umin_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+ %0 = atomicrmw volatile umin i32 addrspace(1)* %ptr, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_umin_i32_ret_addr64:
+; SI: buffer_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_umin_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+ %0 = atomicrmw volatile umin i32 addrspace(1)* %ptr, i32 %in seq_cst
+ store i32 %0, i32 addrspace(1)* %out2
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_or_i32_offset:
+; SI: buffer_atomic_or v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
+define void @atomic_or_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
+ %0 = atomicrmw volatile or i32 addrspace(1)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_or_i32_ret_offset:
+; SI: buffer_atomic_or [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_or_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
+ %0 = atomicrmw volatile or i32 addrspace(1)* %gep, i32 %in seq_cst
+ store i32 %0, i32 addrspace(1)* %out2
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_or_i32_addr64_offset:
+; SI: buffer_atomic_or v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
+define void @atomic_or_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
+ %0 = atomicrmw volatile or i32 addrspace(1)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_or_i32_ret_addr64_offset:
+; SI: buffer_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_or_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
+ %0 = atomicrmw volatile or i32 addrspace(1)* %gep, i32 %in seq_cst
+ store i32 %0, i32 addrspace(1)* %out2
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_or_i32:
+; SI: buffer_atomic_or v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
+define void @atomic_or_i32(i32 addrspace(1)* %out, i32 %in) {
+entry:
+ %0 = atomicrmw volatile or i32 addrspace(1)* %out, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_or_i32_ret:
+; SI: buffer_atomic_or [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; SI: buffer_store_dword [[RET]]
+define void @atomic_or_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+entry:
+ %0 = atomicrmw volatile or i32 addrspace(1)* %out, i32 %in seq_cst
+ store i32 %0, i32 addrspace(1)* %out2
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_or_i32_addr64:
+; SI: buffer_atomic_or v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
+define void @atomic_or_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+ %0 = atomicrmw volatile or i32 addrspace(1)* %ptr, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_or_i32_ret_addr64:
+; SI: buffer_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_or_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+ %0 = atomicrmw volatile or i32 addrspace(1)* %ptr, i32 %in seq_cst
+ store i32 %0, i32 addrspace(1)* %out2
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_xchg_i32_offset:
+; SI: buffer_atomic_swap v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
+define void @atomic_xchg_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
+ %0 = atomicrmw volatile xchg i32 addrspace(1)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_xchg_i32_ret_offset:
+; SI: buffer_atomic_swap [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_xchg_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
+ %0 = atomicrmw volatile xchg i32 addrspace(1)* %gep, i32 %in seq_cst
+ store i32 %0, i32 addrspace(1)* %out2
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_xchg_i32_addr64_offset:
+; SI: buffer_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
+define void @atomic_xchg_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
+ %0 = atomicrmw volatile xchg i32 addrspace(1)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_xchg_i32_ret_addr64_offset:
+; SI: buffer_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_xchg_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
+ %0 = atomicrmw volatile xchg i32 addrspace(1)* %gep, i32 %in seq_cst
+ store i32 %0, i32 addrspace(1)* %out2
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_xchg_i32:
+; SI: buffer_atomic_swap v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
+define void @atomic_xchg_i32(i32 addrspace(1)* %out, i32 %in) {
+entry:
+ %0 = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_xchg_i32_ret:
+; SI: buffer_atomic_swap [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; SI: buffer_store_dword [[RET]]
+define void @atomic_xchg_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+entry:
+ %0 = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in seq_cst
+ store i32 %0, i32 addrspace(1)* %out2
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_xchg_i32_addr64:
+; SI: buffer_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
+define void @atomic_xchg_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+ %0 = atomicrmw volatile xchg i32 addrspace(1)* %ptr, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_xchg_i32_ret_addr64:
+; SI: buffer_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_xchg_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+ %0 = atomicrmw volatile xchg i32 addrspace(1)* %ptr, i32 %in seq_cst
+ store i32 %0, i32 addrspace(1)* %out2
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_xor_i32_offset:
+; SI: buffer_atomic_xor v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
+define void @atomic_xor_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
+ %0 = atomicrmw volatile xor i32 addrspace(1)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_xor_i32_ret_offset:
+; SI: buffer_atomic_xor [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_xor_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
+ %0 = atomicrmw volatile xor i32 addrspace(1)* %gep, i32 %in seq_cst
+ store i32 %0, i32 addrspace(1)* %out2
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_xor_i32_addr64_offset:
+; SI: buffer_atomic_xor v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
+define void @atomic_xor_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
+ %0 = atomicrmw volatile xor i32 addrspace(1)* %gep, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_xor_i32_ret_addr64_offset:
+; SI: buffer_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_xor_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
+ %0 = atomicrmw volatile xor i32 addrspace(1)* %gep, i32 %in seq_cst
+ store i32 %0, i32 addrspace(1)* %out2
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_xor_i32:
+; SI: buffer_atomic_xor v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
+define void @atomic_xor_i32(i32 addrspace(1)* %out, i32 %in) {
+entry:
+ %0 = atomicrmw volatile xor i32 addrspace(1)* %out, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_xor_i32_ret:
+; SI: buffer_atomic_xor [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; SI: buffer_store_dword [[RET]]
+define void @atomic_xor_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+entry:
+ %0 = atomicrmw volatile xor i32 addrspace(1)* %out, i32 %in seq_cst
+ store i32 %0, i32 addrspace(1)* %out2
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_xor_i32_addr64:
+; SI: buffer_atomic_xor v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
+define void @atomic_xor_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+ %0 = atomicrmw volatile xor i32 addrspace(1)* %ptr, i32 %in seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_xor_i32_ret_addr64:
+; SI: buffer_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_xor_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+entry:
+ %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+ %0 = atomicrmw volatile xor i32 addrspace(1)* %ptr, i32 %in seq_cst
+ store i32 %0, i32 addrspace(1)* %out2
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/gv-const-addrspace-fail.ll b/test/CodeGen/AMDGPU/gv-const-addrspace-fail.ll
new file mode 100644
index 000000000000..014b0a5482ab
--- /dev/null
+++ b/test/CodeGen/AMDGPU/gv-const-addrspace-fail.ll
@@ -0,0 +1,57 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; XUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+
+@a = internal addrspace(2) constant [1 x i8] [ i8 7 ], align 1
+
+; FUNC-LABEL: {{^}}test_i8:
+; EG: CF_END
+; SI: buffer_store_byte
+; SI: s_endpgm
+define void @test_i8( i32 %s, i8 addrspace(1)* %out) #3 {
+ %arrayidx = getelementptr inbounds [1 x i8], [1 x i8] addrspace(2)* @a, i32 0, i32 %s
+ %1 = load i8, i8 addrspace(2)* %arrayidx, align 1
+ store i8 %1, i8 addrspace(1)* %out
+ ret void
+}
+
+@b = internal addrspace(2) constant [1 x i16] [ i16 7 ], align 2
+
+; FUNC-LABEL: {{^}}test_i16:
+; EG: CF_END
+; SI: buffer_store_short
+; SI: s_endpgm
+define void @test_i16( i32 %s, i16 addrspace(1)* %out) #3 {
+ %arrayidx = getelementptr inbounds [1 x i16], [1 x i16] addrspace(2)* @b, i32 0, i32 %s
+ %1 = load i16, i16 addrspace(2)* %arrayidx, align 2
+ store i16 %1, i16 addrspace(1)* %out
+ ret void
+}
+
+%struct.bar = type { float, [5 x i8] }
+
+; The illegal i8s aren't handled
+@struct_bar_gv = internal addrspace(2) constant [1 x %struct.bar] [ %struct.bar { float 16.0, [5 x i8] [i8 0, i8 1, i8 2, i8 3, i8 4] } ]
+
+; FUNC-LABEL: {{^}}struct_bar_gv_load:
+define void @struct_bar_gv_load(i8 addrspace(1)* %out, i32 %index) {
+ %gep = getelementptr inbounds [1 x %struct.bar], [1 x %struct.bar] addrspace(2)* @struct_bar_gv, i32 0, i32 0, i32 1, i32 %index
+ %load = load i8, i8 addrspace(2)* %gep, align 1
+ store i8 %load, i8 addrspace(1)* %out, align 1
+ ret void
+}
+
+
+; The private load isn't scalarized.
+@array_vector_gv = internal addrspace(2) constant [4 x <4 x i32>] [ <4 x i32> <i32 1, i32 2, i32 3, i32 4>,
+ <4 x i32> <i32 5, i32 6, i32 7, i32 8>,
+ <4 x i32> <i32 9, i32 10, i32 11, i32 12>,
+ <4 x i32> <i32 13, i32 14, i32 15, i32 16> ]
+
+; FUNC-LABEL: {{^}}array_vector_gv_load:
+define void @array_vector_gv_load(<4 x i32> addrspace(1)* %out, i32 %index) {
+ %gep = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>] addrspace(2)* @array_vector_gv, i32 0, i32 %index
+ %load = load <4 x i32>, <4 x i32> addrspace(2)* %gep, align 16
+ store <4 x i32> %load, <4 x i32> addrspace(1)* %out, align 16
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/gv-const-addrspace.ll b/test/CodeGen/AMDGPU/gv-const-addrspace.ll
new file mode 100644
index 000000000000..3c1fc6c98f74
--- /dev/null
+++ b/test/CodeGen/AMDGPU/gv-const-addrspace.ll
@@ -0,0 +1,101 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+
+@b = internal addrspace(2) constant [1 x i16] [ i16 7 ], align 2
+
+@float_gv = internal unnamed_addr addrspace(2) constant [5 x float] [float 0.0, float 1.0, float 2.0, float 3.0, float 4.0], align 4
+
+; FUNC-LABEL: {{^}}float:
+; FIXME: We should be using s_load_dword here.
+; SI: buffer_load_dword
+; VI: s_load_dword
+
+; EG-DAG: MOV {{\** *}}T2.X
+; EG-DAG: MOV {{\** *}}T3.X
+; EG-DAG: MOV {{\** *}}T4.X
+; EG-DAG: MOV {{\** *}}T5.X
+; EG-DAG: MOV {{\** *}}T6.X
+; EG: MOVA_INT
+
+define void @float(float addrspace(1)* %out, i32 %index) {
+entry:
+ %0 = getelementptr inbounds [5 x float], [5 x float] addrspace(2)* @float_gv, i32 0, i32 %index
+ %1 = load float, float addrspace(2)* %0
+ store float %1, float addrspace(1)* %out
+ ret void
+}
+
+@i32_gv = internal unnamed_addr addrspace(2) constant [5 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4], align 4
+
+; FUNC-LABEL: {{^}}i32:
+
+; FIXME: We should be using s_load_dword here.
+; SI: buffer_load_dword
+; VI: s_load_dword
+
+; EG-DAG: MOV {{\** *}}T2.X
+; EG-DAG: MOV {{\** *}}T3.X
+; EG-DAG: MOV {{\** *}}T4.X
+; EG-DAG: MOV {{\** *}}T5.X
+; EG-DAG: MOV {{\** *}}T6.X
+; EG: MOVA_INT
+
+define void @i32(i32 addrspace(1)* %out, i32 %index) {
+entry:
+ %0 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(2)* @i32_gv, i32 0, i32 %index
+ %1 = load i32, i32 addrspace(2)* %0
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+
+%struct.foo = type { float, [5 x i32] }
+
+@struct_foo_gv = internal unnamed_addr addrspace(2) constant [1 x %struct.foo] [ %struct.foo { float 16.0, [5 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4] } ]
+
+; FUNC-LABEL: {{^}}struct_foo_gv_load:
+; GCN: s_load_dword
+
+define void @struct_foo_gv_load(i32 addrspace(1)* %out, i32 %index) {
+ %gep = getelementptr inbounds [1 x %struct.foo], [1 x %struct.foo] addrspace(2)* @struct_foo_gv, i32 0, i32 0, i32 1, i32 %index
+ %load = load i32, i32 addrspace(2)* %gep, align 4
+ store i32 %load, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+@array_v1_gv = internal addrspace(2) constant [4 x <1 x i32>] [ <1 x i32> <i32 1>,
+ <1 x i32> <i32 2>,
+ <1 x i32> <i32 3>,
+ <1 x i32> <i32 4> ]
+
+; FUNC-LABEL: {{^}}array_v1_gv_load:
+; FIXME: We should be using s_load_dword here.
+; SI: buffer_load_dword
+; VI: s_load_dword
+define void @array_v1_gv_load(<1 x i32> addrspace(1)* %out, i32 %index) {
+ %gep = getelementptr inbounds [4 x <1 x i32>], [4 x <1 x i32>] addrspace(2)* @array_v1_gv, i32 0, i32 %index
+ %load = load <1 x i32>, <1 x i32> addrspace(2)* %gep, align 4
+ store <1 x i32> %load, <1 x i32> addrspace(1)* %out, align 4
+ ret void
+}
+
+define void @gv_addressing_in_branch(float addrspace(1)* %out, i32 %index, i32 %a) {
+entry:
+ %0 = icmp eq i32 0, %a
+ br i1 %0, label %if, label %else
+
+if:
+ %1 = getelementptr inbounds [5 x float], [5 x float] addrspace(2)* @float_gv, i32 0, i32 %index
+ %2 = load float, float addrspace(2)* %1
+ store float %2, float addrspace(1)* %out
+ br label %endif
+
+else:
+ store float 1.0, float addrspace(1)* %out
+ br label %endif
+
+endif:
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/half.ll b/test/CodeGen/AMDGPU/half.ll
new file mode 100644
index 000000000000..bf8f11860b50
--- /dev/null
+++ b/test/CodeGen/AMDGPU/half.ll
@@ -0,0 +1,525 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+
+; half args should be promoted to float
+
+; GCN-LABEL: {{^}}load_f16_arg:
+; GCN: s_load_dword [[ARG:s[0-9]+]]
+; GCN: v_cvt_f16_f32_e32 [[CVT:v[0-9]+]], [[ARG]]
+; GCN: buffer_store_short [[CVT]]
+define void @load_f16_arg(half addrspace(1)* %out, half %arg) #0 {
+ store half %arg, half addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}load_v2f16_arg:
+; GCN-DAG: buffer_load_ushort [[V0:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44
+; GCN-DAG: buffer_load_ushort [[V1:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:46
+; GCN-DAG: buffer_store_short [[V0]], s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; GCN-DAG: buffer_store_short [[V1]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}}
+; GCN: s_endpgm
+define void @load_v2f16_arg(<2 x half> addrspace(1)* %out, <2 x half> %arg) #0 {
+ store <2 x half> %arg, <2 x half> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}load_v3f16_arg:
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN-NOT: buffer_load
+; GCN-DAG: buffer_store_dword
+; GCN-DAG: buffer_store_short
+; GCN-NOT: buffer_store
+; GCN: s_endpgm
+define void @load_v3f16_arg(<3 x half> addrspace(1)* %out, <3 x half> %arg) #0 {
+ store <3 x half> %arg, <3 x half> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}load_v4f16_arg:
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_store_short
+; GCN: buffer_store_short
+; GCN: buffer_store_short
+; GCN: buffer_store_short
+; GCN: s_endpgm
+define void @load_v4f16_arg(<4 x half> addrspace(1)* %out, <4 x half> %arg) #0 {
+ store <4 x half> %arg, <4 x half> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}load_v8f16_arg:
+define void @load_v8f16_arg(<8 x half> addrspace(1)* %out, <8 x half> %arg) #0 {
+ store <8 x half> %arg, <8 x half> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}extload_v2f16_arg:
+define void @extload_v2f16_arg(<2 x float> addrspace(1)* %out, <2 x half> %in) #0 {
+ %fpext = fpext <2 x half> %in to <2 x float>
+ store <2 x float> %fpext, <2 x float> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}extload_f16_to_f32_arg:
+define void @extload_f16_to_f32_arg(float addrspace(1)* %out, half %arg) #0 {
+ %ext = fpext half %arg to float
+ store float %ext, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}extload_v2f16_to_v2f32_arg:
+define void @extload_v2f16_to_v2f32_arg(<2 x float> addrspace(1)* %out, <2 x half> %arg) #0 {
+ %ext = fpext <2 x half> %arg to <2 x float>
+ store <2 x float> %ext, <2 x float> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}extload_v3f16_to_v3f32_arg:
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN-NOT: buffer_load
+; GCN: v_cvt_f32_f16_e32
+; GCN: v_cvt_f32_f16_e32
+; GCN: v_cvt_f32_f16_e32
+; GCN-NOT: v_cvt_f32_f16
+; GCN-DAG: buffer_store_dword
+; GCN-DAG: buffer_store_dwordx2
+; GCN: s_endpgm
+define void @extload_v3f16_to_v3f32_arg(<3 x float> addrspace(1)* %out, <3 x half> %arg) #0 {
+ %ext = fpext <3 x half> %arg to <3 x float>
+ store <3 x float> %ext, <3 x float> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}extload_v4f16_to_v4f32_arg:
+define void @extload_v4f16_to_v4f32_arg(<4 x float> addrspace(1)* %out, <4 x half> %arg) #0 {
+ %ext = fpext <4 x half> %arg to <4 x float>
+ store <4 x float> %ext, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}extload_v8f16_to_v8f32_arg:
+define void @extload_v8f16_to_v8f32_arg(<8 x float> addrspace(1)* %out, <8 x half> %arg) #0 {
+ %ext = fpext <8 x half> %arg to <8 x float>
+ store <8 x float> %ext, <8 x float> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}extload_f16_to_f64_arg:
+define void @extload_f16_to_f64_arg(double addrspace(1)* %out, half %arg) #0 {
+ %ext = fpext half %arg to double
+ store double %ext, double addrspace(1)* %out
+ ret void
+}
+; GCN-LABEL: {{^}}extload_v2f16_to_v2f64_arg:
+define void @extload_v2f16_to_v2f64_arg(<2 x double> addrspace(1)* %out, <2 x half> %arg) #0 {
+ %ext = fpext <2 x half> %arg to <2 x double>
+ store <2 x double> %ext, <2 x double> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}extload_v3f16_to_v3f64_arg:
+define void @extload_v3f16_to_v3f64_arg(<3 x double> addrspace(1)* %out, <3 x half> %arg) #0 {
+ %ext = fpext <3 x half> %arg to <3 x double>
+ store <3 x double> %ext, <3 x double> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}extload_v4f16_to_v4f64_arg:
+define void @extload_v4f16_to_v4f64_arg(<4 x double> addrspace(1)* %out, <4 x half> %arg) #0 {
+ %ext = fpext <4 x half> %arg to <4 x double>
+ store <4 x double> %ext, <4 x double> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}extload_v8f16_to_v8f64_arg:
+define void @extload_v8f16_to_v8f64_arg(<8 x double> addrspace(1)* %out, <8 x half> %arg) #0 {
+ %ext = fpext <8 x half> %arg to <8 x double>
+ store <8 x double> %ext, <8 x double> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}global_load_store_f16:
+; GCN: buffer_load_ushort [[TMP:v[0-9]+]]
+; GCN: buffer_store_short [[TMP]]
+define void @global_load_store_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
+ %val = load half, half addrspace(1)* %in
+ store half %val, half addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}global_load_store_v2f16:
+; GCN: buffer_load_dword [[TMP:v[0-9]+]]
+; GCN: buffer_store_dword [[TMP]]
+define void @global_load_store_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
+ %val = load <2 x half>, <2 x half> addrspace(1)* %in
+ store <2 x half> %val, <2 x half> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}global_load_store_v4f16:
+; GCN: buffer_load_dwordx2 [[TMP:v\[[0-9]+:[0-9]+\]]]
+; GCN: buffer_store_dwordx2 [[TMP]]
+define void @global_load_store_v4f16(<4 x half> addrspace(1)* %in, <4 x half> addrspace(1)* %out) #0 {
+ %val = load <4 x half>, <4 x half> addrspace(1)* %in
+ store <4 x half> %val, <4 x half> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}global_load_store_v8f16:
+; GCN: buffer_load_dwordx4 [[TMP:v\[[0-9]+:[0-9]+\]]]
+; GCN: buffer_store_dwordx4 [[TMP:v\[[0-9]+:[0-9]+\]]]
+; GCN: s_endpgm
+define void @global_load_store_v8f16(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 {
+ %val = load <8 x half>, <8 x half> addrspace(1)* %in
+ store <8 x half> %val, <8 x half> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}global_extload_f16_to_f32:
+; GCN: buffer_load_ushort [[LOAD:v[0-9]+]]
+; GCN: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], [[LOAD]]
+; GCN: buffer_store_dword [[CVT]]
+define void @global_extload_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %in) #0 {
+ %val = load half, half addrspace(1)* %in
+ %cvt = fpext half %val to float
+ store float %cvt, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}global_extload_v2f16_to_v2f32:
+define void @global_extload_v2f16_to_v2f32(<2 x float> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
+ %val = load <2 x half>, <2 x half> addrspace(1)* %in
+ %cvt = fpext <2 x half> %val to <2 x float>
+ store <2 x float> %cvt, <2 x float> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}global_extload_v3f16_to_v3f32:
+define void @global_extload_v3f16_to_v3f32(<3 x float> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 {
+ %val = load <3 x half>, <3 x half> addrspace(1)* %in
+ %cvt = fpext <3 x half> %val to <3 x float>
+ store <3 x float> %cvt, <3 x float> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}global_extload_v4f16_to_v4f32:
+define void @global_extload_v4f16_to_v4f32(<4 x float> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 {
+ %val = load <4 x half>, <4 x half> addrspace(1)* %in
+ %cvt = fpext <4 x half> %val to <4 x float>
+ store <4 x float> %cvt, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}global_extload_v8f16_to_v8f32:
+define void @global_extload_v8f16_to_v8f32(<8 x float> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 {
+ %val = load <8 x half>, <8 x half> addrspace(1)* %in
+ %cvt = fpext <8 x half> %val to <8 x float>
+ store <8 x float> %cvt, <8 x float> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}global_extload_v16f16_to_v16f32:
+define void @global_extload_v16f16_to_v16f32(<16 x float> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 {
+ %val = load <16 x half>, <16 x half> addrspace(1)* %in
+ %cvt = fpext <16 x half> %val to <16 x float>
+ store <16 x float> %cvt, <16 x float> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}global_extload_f16_to_f64:
+; GCN: buffer_load_ushort [[LOAD:v[0-9]+]]
+; GCN: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], [[LOAD]]
+; GCN: v_cvt_f64_f32_e32 [[CVT1:v\[[0-9]+:[0-9]+\]]], [[CVT0]]
+; GCN: buffer_store_dwordx2 [[CVT1]]
+define void @global_extload_f16_to_f64(double addrspace(1)* %out, half addrspace(1)* %in) #0 {
+ %val = load half, half addrspace(1)* %in
+ %cvt = fpext half %val to double
+ store double %cvt, double addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}global_extload_v2f16_to_v2f64:
+define void @global_extload_v2f16_to_v2f64(<2 x double> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
+ %val = load <2 x half>, <2 x half> addrspace(1)* %in
+ %cvt = fpext <2 x half> %val to <2 x double>
+ store <2 x double> %cvt, <2 x double> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}global_extload_v3f16_to_v3f64:
+define void @global_extload_v3f16_to_v3f64(<3 x double> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 {
+ %val = load <3 x half>, <3 x half> addrspace(1)* %in
+ %cvt = fpext <3 x half> %val to <3 x double>
+ store <3 x double> %cvt, <3 x double> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}global_extload_v4f16_to_v4f64:
+define void @global_extload_v4f16_to_v4f64(<4 x double> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 {
+ %val = load <4 x half>, <4 x half> addrspace(1)* %in
+ %cvt = fpext <4 x half> %val to <4 x double>
+ store <4 x double> %cvt, <4 x double> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}global_extload_v8f16_to_v8f64:
+define void @global_extload_v8f16_to_v8f64(<8 x double> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 {
+ %val = load <8 x half>, <8 x half> addrspace(1)* %in
+ %cvt = fpext <8 x half> %val to <8 x double>
+ store <8 x double> %cvt, <8 x double> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}global_extload_v16f16_to_v16f64:
+define void @global_extload_v16f16_to_v16f64(<16 x double> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 {
+ %val = load <16 x half>, <16 x half> addrspace(1)* %in
+ %cvt = fpext <16 x half> %val to <16 x double>
+ store <16 x double> %cvt, <16 x double> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}global_truncstore_f32_to_f16:
+; GCN: buffer_load_dword [[LOAD:v[0-9]+]]
+; GCN: v_cvt_f16_f32_e32 [[CVT:v[0-9]+]], [[LOAD]]
+; GCN: buffer_store_short [[CVT]]
+define void @global_truncstore_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %in) #0 {
+ %val = load float, float addrspace(1)* %in
+ %cvt = fptrunc float %val to half
+ store half %cvt, half addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}global_truncstore_v2f32_to_v2f16:
+; GCN: buffer_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
+; GCN-DAG: v_cvt_f16_f32_e32 [[CVT0:v[0-9]+]], v[[LO]]
+; GCN-DAG: v_cvt_f16_f32_e32 [[CVT1:v[0-9]+]], v[[HI]]
+; GCN-DAG: buffer_store_short [[CVT0]]
+; GCN-DAG: buffer_store_short [[CVT1]]
+; GCN: s_endpgm
+define void @global_truncstore_v2f32_to_v2f16(<2 x half> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 {
+ %val = load <2 x float>, <2 x float> addrspace(1)* %in
+ %cvt = fptrunc <2 x float> %val to <2 x half>
+ store <2 x half> %cvt, <2 x half> addrspace(1)* %out
+ ret void
+}
+
+; FIXME: Shouldn't do the 4th conversion
+; GCN-LABEL: {{^}}global_truncstore_v3f32_to_v3f16:
+; GCN: buffer_load_dwordx4
+; GCN: v_cvt_f16_f32_e32
+; GCN: v_cvt_f16_f32_e32
+; GCN: v_cvt_f16_f32_e32
+; GCN: v_cvt_f16_f32_e32
+; GCN: buffer_store_short
+; GCN: buffer_store_dword
+; GCN: s_endpgm
+define void @global_truncstore_v3f32_to_v3f16(<3 x half> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 {
+ %val = load <3 x float>, <3 x float> addrspace(1)* %in
+ %cvt = fptrunc <3 x float> %val to <3 x half>
+ store <3 x half> %cvt, <3 x half> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}global_truncstore_v4f32_to_v4f16:
+; GCN: buffer_load_dwordx4
+; GCN: v_cvt_f16_f32_e32
+; GCN: v_cvt_f16_f32_e32
+; GCN: v_cvt_f16_f32_e32
+; GCN: v_cvt_f16_f32_e32
+; GCN: buffer_store_short
+; GCN: buffer_store_short
+; GCN: buffer_store_short
+; GCN: buffer_store_short
+; GCN: s_endpgm
+define void @global_truncstore_v4f32_to_v4f16(<4 x half> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
+ %val = load <4 x float>, <4 x float> addrspace(1)* %in
+ %cvt = fptrunc <4 x float> %val to <4 x half>
+ store <4 x half> %cvt, <4 x half> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}global_truncstore_v8f32_to_v8f16:
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+; GCN: v_cvt_f16_f32_e32
+; GCN: v_cvt_f16_f32_e32
+; GCN: v_cvt_f16_f32_e32
+; GCN: v_cvt_f16_f32_e32
+; GCN: v_cvt_f16_f32_e32
+; GCN: v_cvt_f16_f32_e32
+; GCN: v_cvt_f16_f32_e32
+; GCN: v_cvt_f16_f32_e32
+; GCN: buffer_store_short
+; GCN: buffer_store_short
+; GCN: buffer_store_short
+; GCN: buffer_store_short
+; GCN: buffer_store_short
+; GCN: buffer_store_short
+; GCN: buffer_store_short
+; GCN: buffer_store_short
+; GCN: s_endpgm
+define void @global_truncstore_v8f32_to_v8f16(<8 x half> addrspace(1)* %out, <8 x float> addrspace(1)* %in) #0 {
+ %val = load <8 x float>, <8 x float> addrspace(1)* %in
+ %cvt = fptrunc <8 x float> %val to <8 x half>
+ store <8 x half> %cvt, <8 x half> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}global_truncstore_v16f32_to_v16f16:
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+; GCN: v_cvt_f16_f32_e32
+; GCN: v_cvt_f16_f32_e32
+; GCN: v_cvt_f16_f32_e32
+; GCN: v_cvt_f16_f32_e32
+; GCN: v_cvt_f16_f32_e32
+; GCN: v_cvt_f16_f32_e32
+; GCN: v_cvt_f16_f32_e32
+; GCN: v_cvt_f16_f32_e32
+; GCN: v_cvt_f16_f32_e32
+; GCN: v_cvt_f16_f32_e32
+; GCN: v_cvt_f16_f32_e32
+; GCN: v_cvt_f16_f32_e32
+; GCN: v_cvt_f16_f32_e32
+; GCN: v_cvt_f16_f32_e32
+; GCN: v_cvt_f16_f32_e32
+; GCN: v_cvt_f16_f32_e32
+; GCN: buffer_store_short
+; GCN: buffer_store_short
+; GCN: buffer_store_short
+; GCN: buffer_store_short
+; GCN: buffer_store_short
+; GCN: buffer_store_short
+; GCN: buffer_store_short
+; GCN: buffer_store_short
+; GCN: buffer_store_short
+; GCN: buffer_store_short
+; GCN: buffer_store_short
+; GCN: buffer_store_short
+; GCN: buffer_store_short
+; GCN: buffer_store_short
+; GCN: buffer_store_short
+; GCN: buffer_store_short
+; GCN: s_endpgm
+define void @global_truncstore_v16f32_to_v16f16(<16 x half> addrspace(1)* %out, <16 x float> addrspace(1)* %in) #0 {
+ %val = load <16 x float>, <16 x float> addrspace(1)* %in
+ %cvt = fptrunc <16 x float> %val to <16 x half>
+ store <16 x half> %cvt, <16 x half> addrspace(1)* %out
+ ret void
+}
+
+; FIXME: Unsafe math should fold conversions away
+; GCN-LABEL: {{^}}fadd_f16:
+; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}},
+; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}},
+; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}},
+; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}},
+; SI: v_add_f32
+; GCN: s_endpgm
+define void @fadd_f16(half addrspace(1)* %out, half %a, half %b) #0 {
+ %add = fadd half %a, %b
+ store half %add, half addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}fadd_v2f16:
+; SI: v_add_f32
+; SI: v_add_f32
+; GCN: s_endpgm
+define void @fadd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %a, <2 x half> %b) #0 {
+ %add = fadd <2 x half> %a, %b
+ store <2 x half> %add, <2 x half> addrspace(1)* %out, align 8
+ ret void
+}
+
+; GCN-LABEL: {{^}}fadd_v4f16:
+; SI: v_add_f32
+; SI: v_add_f32
+; SI: v_add_f32
+; SI: v_add_f32
+; GCN: s_endpgm
+define void @fadd_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 {
+ %b_ptr = getelementptr <4 x half>, <4 x half> addrspace(1)* %in, i32 1
+ %a = load <4 x half>, <4 x half> addrspace(1)* %in, align 16
+ %b = load <4 x half>, <4 x half> addrspace(1)* %b_ptr, align 16
+ %result = fadd <4 x half> %a, %b
+ store <4 x half> %result, <4 x half> addrspace(1)* %out, align 16
+ ret void
+}
+
+; GCN-LABEL: {{^}}fadd_v8f16:
+; SI: v_add_f32
+; SI: v_add_f32
+; SI: v_add_f32
+; SI: v_add_f32
+; SI: v_add_f32
+; SI: v_add_f32
+; SI: v_add_f32
+; SI: v_add_f32
+; GCN: s_endpgm
+define void @fadd_v8f16(<8 x half> addrspace(1)* %out, <8 x half> %a, <8 x half> %b) #0 {
+ %add = fadd <8 x half> %a, %b
+ store <8 x half> %add, <8 x half> addrspace(1)* %out, align 32
+ ret void
+}
+
+; GCN-LABEL: {{^}}fsub_f16:
+; GCN: v_subrev_f32_e32
+; GCN: s_endpgm
+define void @fsub_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
+ %b_ptr = getelementptr half, half addrspace(1)* %in, i32 1
+ %a = load half, half addrspace(1)* %in
+ %b = load half, half addrspace(1)* %b_ptr
+ %sub = fsub half %a, %b
+ store half %sub, half addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_bitcast_from_half:
+; GCN: buffer_load_ushort [[TMP:v[0-9]+]]
+; GCN: buffer_store_short [[TMP]]
+define void @test_bitcast_from_half(half addrspace(1)* %in, i16 addrspace(1)* %out) #0 {
+ %val = load half, half addrspace(1)* %in
+ %val_int = bitcast half %val to i16
+ store i16 %val_int, i16 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_bitcast_to_half:
+; GCN: buffer_load_ushort [[TMP:v[0-9]+]]
+; GCN: buffer_store_short [[TMP]]
+define void @test_bitcast_to_half(half addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
+ %val = load i16, i16 addrspace(1)* %in
+ %val_fp = bitcast i16 %val to half
+ store half %val_fp, half addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/hsa.ll b/test/CodeGen/AMDGPU/hsa.ll
new file mode 100644
index 000000000000..f9113399afe8
--- /dev/null
+++ b/test/CodeGen/AMDGPU/hsa.ll
@@ -0,0 +1,14 @@
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck --check-prefix=HSA %s
+
+; HSA: .section .hsa.version
+; HSA-NEXT: .ascii "HSA Code Unit:0.0:AMD:0.1:GFX8.1:0"
+; HSA: {{^}}simple:
+; Make sure we are setting the ATC bit:
+; HSA: s_mov_b32 s[[HI:[0-9]]], 0x100f000
+; HSA: buffer_store_dword v{{[0-9]+}}, s[0:[[HI]]], 0
+
+define void @simple(i32 addrspace(1)* %out) {
+entry:
+ store i32 0, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/i1-copy-implicit-def.ll b/test/CodeGen/AMDGPU/i1-copy-implicit-def.ll
new file mode 100644
index 000000000000..b11a21137642
--- /dev/null
+++ b/test/CodeGen/AMDGPU/i1-copy-implicit-def.ll
@@ -0,0 +1,22 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+; SILowerI1Copies was not handling IMPLICIT_DEF
+; SI-LABEL: {{^}}br_implicit_def:
+; SI: BB#0:
+; SI-NEXT: s_and_saveexec_b64
+; SI-NEXT: s_xor_b64
+; SI-NEXT: BB#1:
+define void @br_implicit_def(i32 addrspace(1)* %out, i32 %arg) #0 {
+bb:
+ br i1 undef, label %bb1, label %bb2
+
+bb1:
+ store volatile i32 123, i32 addrspace(1)* %out
+ ret void
+
+bb2:
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/i1-copy-phi.ll b/test/CodeGen/AMDGPU/i1-copy-phi.ll
new file mode 100644
index 000000000000..105cd06b330a
--- /dev/null
+++ b/test/CodeGen/AMDGPU/i1-copy-phi.ll
@@ -0,0 +1,30 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+; SI-LABEL: {{^}}br_i1_phi:
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
+; SI: s_and_saveexec_b64
+; SI: s_xor_b64
+; SI: v_mov_b32_e32 [[REG]], -1{{$}}
+; SI: v_cmp_ne_i32_e32 vcc, 0, [[REG]]
+; SI: s_and_saveexec_b64
+; SI: s_xor_b64
+; SI: s_endpgm
+define void @br_i1_phi(i32 %arg, i1 %arg1) #0 {
+bb:
+ br i1 %arg1, label %bb2, label %bb3
+
+bb2: ; preds = %bb
+ br label %bb3
+
+bb3: ; preds = %bb2, %bb
+ %tmp = phi i1 [ true, %bb2 ], [ false, %bb ]
+ br i1 %tmp, label %bb4, label %bb6
+
+bb4: ; preds = %bb3
+ %tmp5 = mul i32 undef, %arg
+ br label %bb6
+
+bb6: ; preds = %bb4, %bb3
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/i8-to-double-to-float.ll b/test/CodeGen/AMDGPU/i8-to-double-to-float.ll
new file mode 100644
index 000000000000..c218e1918bb0
--- /dev/null
+++ b/test/CodeGen/AMDGPU/i8-to-double-to-float.ll
@@ -0,0 +1,11 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+;CHECK: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @test(float addrspace(1)* %out, i8 addrspace(1)* %in) {
+ %1 = load i8, i8 addrspace(1)* %in
+ %2 = uitofp i8 %1 to double
+ %3 = fptrunc double %2 to float
+ store float %3, float addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/icmp-select-sete-reverse-args.ll b/test/CodeGen/AMDGPU/icmp-select-sete-reverse-args.ll
new file mode 100644
index 000000000000..60e59a5a5286
--- /dev/null
+++ b/test/CodeGen/AMDGPU/icmp-select-sete-reverse-args.ll
@@ -0,0 +1,18 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+;Test that a select with reversed True/False values is correctly lowered
+;to a SETNE_INT. There should only be one SETNE_INT instruction.
+
+;CHECK: SETNE_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK-NOT: SETNE_INT
+
+define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+entry:
+ %0 = load i32, i32 addrspace(1)* %in
+ %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
+ %1 = load i32, i32 addrspace(1)* %arrayidx1
+ %cmp = icmp eq i32 %0, %1
+ %value = select i1 %cmp, i32 0, i32 -1
+ store i32 %value, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/icmp64.ll b/test/CodeGen/AMDGPU/icmp64.ll
new file mode 100644
index 000000000000..0eaa33ebafed
--- /dev/null
+++ b/test/CodeGen/AMDGPU/icmp64.ll
@@ -0,0 +1,93 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+; SI-LABEL: {{^}}test_i64_eq:
+; SI: v_cmp_eq_i64
+define void @test_i64_eq(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+ %cmp = icmp eq i64 %a, %b
+ %result = sext i1 %cmp to i32
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}test_i64_ne:
+; SI: v_cmp_ne_i64
+define void @test_i64_ne(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+ %cmp = icmp ne i64 %a, %b
+ %result = sext i1 %cmp to i32
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}test_i64_slt:
+; SI: v_cmp_lt_i64
+define void @test_i64_slt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+ %cmp = icmp slt i64 %a, %b
+ %result = sext i1 %cmp to i32
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}test_i64_ult:
+; SI: v_cmp_lt_u64
+define void @test_i64_ult(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+ %cmp = icmp ult i64 %a, %b
+ %result = sext i1 %cmp to i32
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}test_i64_sle:
+; SI: v_cmp_le_i64
+define void @test_i64_sle(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+ %cmp = icmp sle i64 %a, %b
+ %result = sext i1 %cmp to i32
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}test_i64_ule:
+; SI: v_cmp_le_u64
+define void @test_i64_ule(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+ %cmp = icmp ule i64 %a, %b
+ %result = sext i1 %cmp to i32
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}test_i64_sgt:
+; SI: v_cmp_gt_i64
+define void @test_i64_sgt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+ %cmp = icmp sgt i64 %a, %b
+ %result = sext i1 %cmp to i32
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}test_i64_ugt:
+; SI: v_cmp_gt_u64
+define void @test_i64_ugt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+ %cmp = icmp ugt i64 %a, %b
+ %result = sext i1 %cmp to i32
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}test_i64_sge:
+; SI: v_cmp_ge_i64
+define void @test_i64_sge(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+ %cmp = icmp sge i64 %a, %b
+ %result = sext i1 %cmp to i32
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}test_i64_uge:
+; SI: v_cmp_ge_u64
+define void @test_i64_uge(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+ %cmp = icmp uge i64 %a, %b
+ %result = sext i1 %cmp to i32
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
diff --git a/test/CodeGen/AMDGPU/imm.ll b/test/CodeGen/AMDGPU/imm.ll
new file mode 100644
index 000000000000..12eed550eb1f
--- /dev/null
+++ b/test/CodeGen/AMDGPU/imm.ll
@@ -0,0 +1,617 @@
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=CHECK %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CHECK %s
+
+; Use a 64-bit value with lo bits that can be represented as an inline constant
+; CHECK-LABEL: {{^}}i64_imm_inline_lo:
+; CHECK: s_mov_b32 [[LO:s[0-9]+]], 5
+; CHECK: v_mov_b32_e32 v[[LO_VGPR:[0-9]+]], [[LO]]
+; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VGPR]]:
+define void @i64_imm_inline_lo(i64 addrspace(1) *%out) {
+entry:
+ store i64 1311768464867721221, i64 addrspace(1) *%out ; 0x1234567800000005
+ ret void
+}
+
+; Use a 64-bit value with hi bits that can be represented as an inline constant
+; CHECK-LABEL: {{^}}i64_imm_inline_hi:
+; CHECK: s_mov_b32 [[HI:s[0-9]+]], 5
+; CHECK: v_mov_b32_e32 v[[HI_VGPR:[0-9]+]], [[HI]]
+; CHECK: buffer_store_dwordx2 v{{\[[0-9]+:}}[[HI_VGPR]]
+define void @i64_imm_inline_hi(i64 addrspace(1) *%out) {
+entry:
+ store i64 21780256376, i64 addrspace(1) *%out ; 0x0000000512345678
+ ret void
+}
+
+; CHECK-LABEL: {{^}}store_imm_neg_0.0_i64:
+; CHECK-DAG: s_mov_b32 s[[HI_SREG:[0-9]+]], 0x80000000
+; CHECK-DAG: s_mov_b32 s[[LO_SREG:[0-9]+]], 0{{$}}
+; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG]]
+; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], s[[HI_SREG]]
+; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
+define void @store_imm_neg_0.0_i64(i64 addrspace(1) *%out) {
+ store i64 -9223372036854775808, i64 addrspace(1) *%out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}store_inline_imm_neg_0.0_i32:
+; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80000000
+; CHECK: buffer_store_dword [[REG]]
+define void @store_inline_imm_neg_0.0_i32(i32 addrspace(1)* %out) {
+ store i32 -2147483648, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}store_inline_imm_0.0_f32:
+; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
+; CHECK: buffer_store_dword [[REG]]
+define void @store_inline_imm_0.0_f32(float addrspace(1)* %out) {
+ store float 0.0, float addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}store_imm_neg_0.0_f32:
+; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80000000
+; CHECK: buffer_store_dword [[REG]]
+define void @store_imm_neg_0.0_f32(float addrspace(1)* %out) {
+ store float -0.0, float addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}store_inline_imm_0.5_f32:
+; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 0.5{{$}}
+; CHECK: buffer_store_dword [[REG]]
+define void @store_inline_imm_0.5_f32(float addrspace(1)* %out) {
+ store float 0.5, float addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}store_inline_imm_m_0.5_f32:
+; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], -0.5{{$}}
+; CHECK: buffer_store_dword [[REG]]
+define void @store_inline_imm_m_0.5_f32(float addrspace(1)* %out) {
+ store float -0.5, float addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}store_inline_imm_1.0_f32:
+; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0{{$}}
+; CHECK: buffer_store_dword [[REG]]
+define void @store_inline_imm_1.0_f32(float addrspace(1)* %out) {
+ store float 1.0, float addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}store_inline_imm_m_1.0_f32:
+; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], -1.0{{$}}
+; CHECK: buffer_store_dword [[REG]]
+define void @store_inline_imm_m_1.0_f32(float addrspace(1)* %out) {
+ store float -1.0, float addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}store_inline_imm_2.0_f32:
+; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 2.0{{$}}
+; CHECK: buffer_store_dword [[REG]]
+define void @store_inline_imm_2.0_f32(float addrspace(1)* %out) {
+ store float 2.0, float addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}store_inline_imm_m_2.0_f32:
+; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], -2.0{{$}}
+; CHECK: buffer_store_dword [[REG]]
+define void @store_inline_imm_m_2.0_f32(float addrspace(1)* %out) {
+ store float -2.0, float addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}store_inline_imm_4.0_f32:
+; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 4.0{{$}}
+; CHECK: buffer_store_dword [[REG]]
+define void @store_inline_imm_4.0_f32(float addrspace(1)* %out) {
+ store float 4.0, float addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}store_inline_imm_m_4.0_f32:
+; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], -4.0{{$}}
+; CHECK: buffer_store_dword [[REG]]
+define void @store_inline_imm_m_4.0_f32(float addrspace(1)* %out) {
+ store float -4.0, float addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}store_literal_imm_f32:
+; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 0x45800000
+; CHECK: buffer_store_dword [[REG]]
+define void @store_literal_imm_f32(float addrspace(1)* %out) {
+ store float 4096.0, float addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}add_inline_imm_0.0_f32:
+; CHECK: s_load_dword [[VAL:s[0-9]+]]
+; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 0, [[VAL]]{{$}}
+; CHECK: buffer_store_dword [[REG]]
+define void @add_inline_imm_0.0_f32(float addrspace(1)* %out, float %x) {
+ %y = fadd float %x, 0.0
+ store float %y, float addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}add_inline_imm_0.5_f32:
+; CHECK: s_load_dword [[VAL:s[0-9]+]]
+; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 0.5, [[VAL]]{{$}}
+; CHECK: buffer_store_dword [[REG]]
+define void @add_inline_imm_0.5_f32(float addrspace(1)* %out, float %x) {
+ %y = fadd float %x, 0.5
+ store float %y, float addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}add_inline_imm_neg_0.5_f32:
+; CHECK: s_load_dword [[VAL:s[0-9]+]]
+; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], -0.5, [[VAL]]{{$}}
+; CHECK: buffer_store_dword [[REG]]
+define void @add_inline_imm_neg_0.5_f32(float addrspace(1)* %out, float %x) {
+ %y = fadd float %x, -0.5
+ store float %y, float addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}add_inline_imm_1.0_f32:
+; CHECK: s_load_dword [[VAL:s[0-9]+]]
+; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 1.0, [[VAL]]{{$}}
+; CHECK: buffer_store_dword [[REG]]
+define void @add_inline_imm_1.0_f32(float addrspace(1)* %out, float %x) {
+ %y = fadd float %x, 1.0
+ store float %y, float addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}add_inline_imm_neg_1.0_f32:
+; CHECK: s_load_dword [[VAL:s[0-9]+]]
+; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], -1.0, [[VAL]]{{$}}
+; CHECK: buffer_store_dword [[REG]]
+define void @add_inline_imm_neg_1.0_f32(float addrspace(1)* %out, float %x) {
+ %y = fadd float %x, -1.0
+ store float %y, float addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}add_inline_imm_2.0_f32:
+; CHECK: s_load_dword [[VAL:s[0-9]+]]
+; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 2.0, [[VAL]]{{$}}
+; CHECK: buffer_store_dword [[REG]]
+define void @add_inline_imm_2.0_f32(float addrspace(1)* %out, float %x) {
+ %y = fadd float %x, 2.0
+ store float %y, float addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}add_inline_imm_neg_2.0_f32:
+; CHECK: s_load_dword [[VAL:s[0-9]+]]
+; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], -2.0, [[VAL]]{{$}}
+; CHECK: buffer_store_dword [[REG]]
+define void @add_inline_imm_neg_2.0_f32(float addrspace(1)* %out, float %x) {
+ %y = fadd float %x, -2.0
+ store float %y, float addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}add_inline_imm_4.0_f32:
+; CHECK: s_load_dword [[VAL:s[0-9]+]]
+; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 4.0, [[VAL]]{{$}}
+; CHECK: buffer_store_dword [[REG]]
+define void @add_inline_imm_4.0_f32(float addrspace(1)* %out, float %x) {
+ %y = fadd float %x, 4.0
+ store float %y, float addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}add_inline_imm_neg_4.0_f32:
+; CHECK: s_load_dword [[VAL:s[0-9]+]]
+; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], -4.0, [[VAL]]{{$}}
+; CHECK: buffer_store_dword [[REG]]
+define void @add_inline_imm_neg_4.0_f32(float addrspace(1)* %out, float %x) {
+ %y = fadd float %x, -4.0
+ store float %y, float addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}commute_add_inline_imm_0.5_f32:
+; CHECK: buffer_load_dword [[VAL:v[0-9]+]]
+; CHECK: v_add_f32_e32 [[REG:v[0-9]+]], 0.5, [[VAL]]
+; CHECK: buffer_store_dword [[REG]]
+define void @commute_add_inline_imm_0.5_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
+ %x = load float, float addrspace(1)* %in
+ %y = fadd float %x, 0.5
+ store float %y, float addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}commute_add_literal_f32:
+; CHECK: buffer_load_dword [[VAL:v[0-9]+]]
+; CHECK: v_add_f32_e32 [[REG:v[0-9]+]], 0x44800000, [[VAL]]
+; CHECK: buffer_store_dword [[REG]]
+define void @commute_add_literal_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
+ %x = load float, float addrspace(1)* %in
+ %y = fadd float %x, 1024.0
+ store float %y, float addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}add_inline_imm_1_f32:
+; CHECK: s_load_dword [[VAL:s[0-9]+]]
+; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 1, [[VAL]]{{$}}
+; CHECK: buffer_store_dword [[REG]]
+define void @add_inline_imm_1_f32(float addrspace(1)* %out, float %x) {
+ %y = fadd float %x, 0x36a0000000000000
+ store float %y, float addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}add_inline_imm_2_f32:
+; CHECK: s_load_dword [[VAL:s[0-9]+]]
+; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 2, [[VAL]]{{$}}
+; CHECK: buffer_store_dword [[REG]]
+define void @add_inline_imm_2_f32(float addrspace(1)* %out, float %x) {
+ %y = fadd float %x, 0x36b0000000000000
+ store float %y, float addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}add_inline_imm_16_f32:
+; CHECK: s_load_dword [[VAL:s[0-9]+]]
+; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 16, [[VAL]]
+; CHECK: buffer_store_dword [[REG]]
+define void @add_inline_imm_16_f32(float addrspace(1)* %out, float %x) {
+ %y = fadd float %x, 0x36e0000000000000
+ store float %y, float addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}add_inline_imm_neg_1_f32:
+; CHECK: s_load_dword [[VAL:s[0-9]+]]
+; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], -1, [[VAL]]
+; CHECK: buffer_store_dword [[REG]]
+define void @add_inline_imm_neg_1_f32(float addrspace(1)* %out, float %x) {
+ %y = fadd float %x, 0xffffffffe0000000
+ store float %y, float addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}add_inline_imm_neg_2_f32:
+; CHECK: s_load_dword [[VAL:s[0-9]+]]
+; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], -2, [[VAL]]
+; CHECK: buffer_store_dword [[REG]]
+define void @add_inline_imm_neg_2_f32(float addrspace(1)* %out, float %x) {
+ %y = fadd float %x, 0xffffffffc0000000
+ store float %y, float addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}add_inline_imm_neg_16_f32:
+; CHECK: s_load_dword [[VAL:s[0-9]+]]
+; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], -16, [[VAL]]
+; CHECK: buffer_store_dword [[REG]]
+define void @add_inline_imm_neg_16_f32(float addrspace(1)* %out, float %x) {
+ %y = fadd float %x, 0xfffffffe00000000
+ store float %y, float addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}add_inline_imm_63_f32:
+; CHECK: s_load_dword [[VAL:s[0-9]+]]
+; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 63, [[VAL]]
+; CHECK: buffer_store_dword [[REG]]
+define void @add_inline_imm_63_f32(float addrspace(1)* %out, float %x) {
+ %y = fadd float %x, 0x36ff800000000000
+ store float %y, float addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}add_inline_imm_64_f32:
+; CHECK: s_load_dword [[VAL:s[0-9]+]]
+; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 64, [[VAL]]
+; CHECK: buffer_store_dword [[REG]]
+define void @add_inline_imm_64_f32(float addrspace(1)* %out, float %x) {
+ %y = fadd float %x, 0x3700000000000000
+ store float %y, float addrspace(1)* %out
+ ret void
+}
+
+
+; CHECK-LABEL: {{^}}add_inline_imm_0.0_f64:
+; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 0, [[VAL]]
+; CHECK: buffer_store_dwordx2 [[REG]]
+define void @add_inline_imm_0.0_f64(double addrspace(1)* %out, double %x) {
+ %y = fadd double %x, 0.0
+ store double %y, double addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}add_inline_imm_0.5_f64:
+; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 0.5, [[VAL]]
+; CHECK: buffer_store_dwordx2 [[REG]]
+define void @add_inline_imm_0.5_f64(double addrspace(1)* %out, double %x) {
+ %y = fadd double %x, 0.5
+ store double %y, double addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}add_inline_imm_neg_0.5_f64:
+; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -0.5, [[VAL]]
+; CHECK: buffer_store_dwordx2 [[REG]]
+define void @add_inline_imm_neg_0.5_f64(double addrspace(1)* %out, double %x) {
+ %y = fadd double %x, -0.5
+ store double %y, double addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}add_inline_imm_1.0_f64:
+; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 1.0, [[VAL]]
+; CHECK: buffer_store_dwordx2 [[REG]]
+define void @add_inline_imm_1.0_f64(double addrspace(1)* %out, double %x) {
+ %y = fadd double %x, 1.0
+ store double %y, double addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}add_inline_imm_neg_1.0_f64:
+; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -1.0, [[VAL]]
+; CHECK: buffer_store_dwordx2 [[REG]]
+define void @add_inline_imm_neg_1.0_f64(double addrspace(1)* %out, double %x) {
+ %y = fadd double %x, -1.0
+ store double %y, double addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}add_inline_imm_2.0_f64:
+; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 2.0, [[VAL]]
+; CHECK: buffer_store_dwordx2 [[REG]]
+define void @add_inline_imm_2.0_f64(double addrspace(1)* %out, double %x) {
+ %y = fadd double %x, 2.0
+ store double %y, double addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}add_inline_imm_neg_2.0_f64:
+; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -2.0, [[VAL]]
+; CHECK: buffer_store_dwordx2 [[REG]]
+define void @add_inline_imm_neg_2.0_f64(double addrspace(1)* %out, double %x) {
+ %y = fadd double %x, -2.0
+ store double %y, double addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}add_inline_imm_4.0_f64:
+; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 4.0, [[VAL]]
+; CHECK: buffer_store_dwordx2 [[REG]]
+define void @add_inline_imm_4.0_f64(double addrspace(1)* %out, double %x) {
+ %y = fadd double %x, 4.0
+ store double %y, double addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}add_inline_imm_neg_4.0_f64:
+; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -4.0, [[VAL]]
+; CHECK: buffer_store_dwordx2 [[REG]]
+define void @add_inline_imm_neg_4.0_f64(double addrspace(1)* %out, double %x) {
+ %y = fadd double %x, -4.0
+ store double %y, double addrspace(1)* %out
+ ret void
+}
+
+
+; CHECK-LABEL: {{^}}add_inline_imm_1_f64:
+; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 1, [[VAL]]
+; CHECK: buffer_store_dwordx2 [[REG]]
+define void @add_inline_imm_1_f64(double addrspace(1)* %out, double %x) {
+ %y = fadd double %x, 0x0000000000000001
+ store double %y, double addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}add_inline_imm_2_f64:
+; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 2, [[VAL]]
+; CHECK: buffer_store_dwordx2 [[REG]]
+define void @add_inline_imm_2_f64(double addrspace(1)* %out, double %x) {
+ %y = fadd double %x, 0x0000000000000002
+ store double %y, double addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}add_inline_imm_16_f64:
+; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 16, [[VAL]]
+; CHECK: buffer_store_dwordx2 [[REG]]
+define void @add_inline_imm_16_f64(double addrspace(1)* %out, double %x) {
+ %y = fadd double %x, 0x0000000000000010
+ store double %y, double addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}add_inline_imm_neg_1_f64:
+; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -1, [[VAL]]
+; CHECK: buffer_store_dwordx2 [[REG]]
+define void @add_inline_imm_neg_1_f64(double addrspace(1)* %out, double %x) {
+ %y = fadd double %x, 0xffffffffffffffff
+ store double %y, double addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}add_inline_imm_neg_2_f64:
+; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -2, [[VAL]]
+; CHECK: buffer_store_dwordx2 [[REG]]
+define void @add_inline_imm_neg_2_f64(double addrspace(1)* %out, double %x) {
+ %y = fadd double %x, 0xfffffffffffffffe
+ store double %y, double addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}add_inline_imm_neg_16_f64:
+; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -16, [[VAL]]
+; CHECK: buffer_store_dwordx2 [[REG]]
+define void @add_inline_imm_neg_16_f64(double addrspace(1)* %out, double %x) {
+ %y = fadd double %x, 0xfffffffffffffff0
+ store double %y, double addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}add_inline_imm_63_f64:
+; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 63, [[VAL]]
+; CHECK: buffer_store_dwordx2 [[REG]]
+define void @add_inline_imm_63_f64(double addrspace(1)* %out, double %x) {
+ %y = fadd double %x, 0x000000000000003F
+ store double %y, double addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}add_inline_imm_64_f64:
+; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
+; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 64, [[VAL]]
+; CHECK: buffer_store_dwordx2 [[REG]]
+define void @add_inline_imm_64_f64(double addrspace(1)* %out, double %x) {
+ %y = fadd double %x, 0x0000000000000040
+ store double %y, double addrspace(1)* %out
+ ret void
+}
+
+
+; CHECK-LABEL: {{^}}store_inline_imm_0.0_f64:
+; CHECK: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0
+; CHECK: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0
+; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
+define void @store_inline_imm_0.0_f64(double addrspace(1)* %out) {
+ store double 0.0, double addrspace(1)* %out
+ ret void
+}
+
+
+; CHECK-LABEL: {{^}}store_literal_imm_neg_0.0_f64:
+; CHECK-DAG: s_mov_b32 s[[HI_SREG:[0-9]+]], 0x80000000
+; CHECK-DAG: s_mov_b32 s[[LO_SREG:[0-9]+]], 0{{$}}
+; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG]]
+; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], s[[HI_SREG]]
+; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
+define void @store_literal_imm_neg_0.0_f64(double addrspace(1)* %out) {
+ store double -0.0, double addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}store_inline_imm_0.5_f64:
+; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}}
+; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x3fe00000
+; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
+define void @store_inline_imm_0.5_f64(double addrspace(1)* %out) {
+ store double 0.5, double addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}store_inline_imm_m_0.5_f64:
+; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}}
+; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0xbfe00000
+; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
+define void @store_inline_imm_m_0.5_f64(double addrspace(1)* %out) {
+ store double -0.5, double addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}store_inline_imm_1.0_f64:
+; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}}
+; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x3ff00000
+; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
+define void @store_inline_imm_1.0_f64(double addrspace(1)* %out) {
+ store double 1.0, double addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}store_inline_imm_m_1.0_f64:
+; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}}
+; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0xbff00000
+; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
+define void @store_inline_imm_m_1.0_f64(double addrspace(1)* %out) {
+ store double -1.0, double addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}store_inline_imm_2.0_f64:
+; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}}
+; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 2.0
+; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
+define void @store_inline_imm_2.0_f64(double addrspace(1)* %out) {
+ store double 2.0, double addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}store_inline_imm_m_2.0_f64:
+; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}}
+; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], -2.0
+; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
+define void @store_inline_imm_m_2.0_f64(double addrspace(1)* %out) {
+ store double -2.0, double addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}store_inline_imm_4.0_f64:
+; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}}
+; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x40100000
+; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
+define void @store_inline_imm_4.0_f64(double addrspace(1)* %out) {
+ store double 4.0, double addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}store_inline_imm_m_4.0_f64:
+; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}}
+; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0xc0100000
+; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
+define void @store_inline_imm_m_4.0_f64(double addrspace(1)* %out) {
+ store double -4.0, double addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}store_literal_imm_f64:
+; CHECK-DAG: s_mov_b32 s[[HI_SREG:[0-9]+]], 0x40b00000
+; CHECK-DAG: s_mov_b32 s[[LO_SREG:[0-9]+]], 0{{$}}
+; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG]]
+; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], s[[HI_SREG]]
+; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
+define void @store_literal_imm_f64(double addrspace(1)* %out) {
+ store double 4096.0, double addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/test/CodeGen/AMDGPU/indirect-addressing-si.ll
new file mode 100644
index 000000000000..f551606d63a7
--- /dev/null
+++ b/test/CodeGen/AMDGPU/indirect-addressing-si.ll
@@ -0,0 +1,121 @@
+; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
+
+; Tests for indirect addressing on SI, which is implemented using dynamic
+; indexing of vectors.
+
+; CHECK-LABEL: {{^}}extract_w_offset:
+; CHECK: s_mov_b32 m0
+; CHECK-NEXT: v_movrels_b32_e32
+define void @extract_w_offset(float addrspace(1)* %out, i32 %in) {
+entry:
+ %0 = add i32 %in, 1
+ %1 = extractelement <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, i32 %0
+ store float %1, float addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}extract_wo_offset:
+; CHECK: s_mov_b32 m0
+; CHECK-NEXT: v_movrels_b32_e32
+define void @extract_wo_offset(float addrspace(1)* %out, i32 %in) {
+entry:
+ %0 = extractelement <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, i32 %in
+ store float %0, float addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}extract_neg_offset_sgpr:
+; The offset depends on the register that holds the first element of the vector.
+; CHECK: s_add_i32 m0, s{{[0-9]+}}, 0xfffffe{{[0-9a-z]+}}
+; CHECK: v_movrels_b32_e32 v{{[0-9]}}, v0
+define void @extract_neg_offset_sgpr(i32 addrspace(1)* %out, i32 %offset) {
+entry:
+ %index = add i32 %offset, -512
+ %value = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %index
+ store i32 %value, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}extract_neg_offset_vgpr:
+; The offset depends on the register that holds the first element of the vector.
+; CHECK: v_readfirstlane_b32
+; CHECK: s_add_i32 m0, m0, 0xfffffe{{[0-9a-z]+}}
+; CHECK-NEXT: v_movrels_b32_e32 v{{[0-9]}}, v0
+; CHECK: s_cbranch_execnz
+define void @extract_neg_offset_vgpr(i32 addrspace(1)* %out) {
+entry:
+ %id = call i32 @llvm.r600.read.tidig.x() #1
+ %index = add i32 %id, -512
+ %value = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %index
+ store i32 %value, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}insert_w_offset:
+; CHECK: s_mov_b32 m0
+; CHECK-NEXT: v_movreld_b32_e32
+define void @insert_w_offset(float addrspace(1)* %out, i32 %in) {
+entry:
+ %0 = add i32 %in, 1
+ %1 = insertelement <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, float 5.0, i32 %0
+ %2 = extractelement <4 x float> %1, i32 2
+ store float %2, float addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}insert_wo_offset:
+; CHECK: s_mov_b32 m0
+; CHECK-NEXT: v_movreld_b32_e32
+define void @insert_wo_offset(float addrspace(1)* %out, i32 %in) {
+entry:
+ %0 = insertelement <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, float 5.0, i32 %in
+ %1 = extractelement <4 x float> %0, i32 2
+ store float %1, float addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}insert_neg_offset_sgpr:
+; The offset depends on the register that holds the first element of the vector.
+; CHECK: s_add_i32 m0, s{{[0-9]+}}, 0xfffffe{{[0-9a-z]+}}
+; CHECK: v_movreld_b32_e32 v0, v{{[0-9]}}
+define void @insert_neg_offset_sgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out, i32 %offset) {
+entry:
+ %index = add i32 %offset, -512
+ %value = insertelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 5, i32 %index
+ store <4 x i32> %value, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}insert_neg_offset_vgpr:
+; The offset depends on the register that holds the first element of the vector.
+; CHECK: v_readfirstlane_b32
+; CHECK: s_add_i32 m0, m0, 0xfffffe{{[0-9a-z]+}}
+; CHECK-NEXT: v_movreld_b32_e32 v0, v{{[0-9]}}
+; CHECK: s_cbranch_execnz
+define void @insert_neg_offset_vgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
+entry:
+ %id = call i32 @llvm.r600.read.tidig.x() #1
+ %index = add i32 %id, -512
+ %value = insertelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 5, i32 %index
+ store <4 x i32> %value, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}insert_neg_inline_offset_vgpr:
+; The offset depends on the register that holds the first element of the vector.
+; CHECK: v_readfirstlane_b32
+; CHECK: s_add_i32 m0, m0, -{{[0-9]+}}
+; CHECK-NEXT: v_movreld_b32_e32 v0, v{{[0-9]}}
+; CHECK: s_cbranch_execnz
+define void @insert_neg_inline_offset_vgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
+entry:
+ %id = call i32 @llvm.r600.read.tidig.x() #1
+ %index = add i32 %id, -16
+ %value = insertelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 5, i32 %index
+ store <4 x i32> %value, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+declare i32 @llvm.r600.read.tidig.x() #1
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/indirect-private-64.ll b/test/CodeGen/AMDGPU/indirect-private-64.ll
new file mode 100644
index 000000000000..d63e1b6c5212
--- /dev/null
+++ b/test/CodeGen/AMDGPU/indirect-private-64.ll
@@ -0,0 +1,91 @@
+; RUN: llc -march=amdgcn -mcpu=SI -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=SI -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI %s
+
+
+declare void @llvm.AMDGPU.barrier.local() noduplicate nounwind
+
+; SI-LABEL: {{^}}private_access_f64_alloca:
+
+; SI-ALLOCA: buffer_store_dwordx2
+; SI-ALLOCA: buffer_load_dwordx2
+
+; SI-PROMOTE: ds_write_b64
+; SI-PROMOTE: ds_read_b64
+define void @private_access_f64_alloca(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in, i32 %b) nounwind {
+ %val = load double, double addrspace(1)* %in, align 8
+ %array = alloca double, i32 16, align 8
+ %ptr = getelementptr double, double* %array, i32 %b
+ store double %val, double* %ptr, align 8
+ call void @llvm.AMDGPU.barrier.local() noduplicate nounwind
+ %result = load double, double* %ptr, align 8
+ store double %result, double addrspace(1)* %out, align 8
+ ret void
+}
+
+; SI-LABEL: {{^}}private_access_v2f64_alloca:
+
+; SI-ALLOCA: buffer_store_dwordx4
+; SI-ALLOCA: buffer_load_dwordx4
+
+; SI-PROMOTE: ds_write_b32
+; SI-PROMOTE: ds_write_b32
+; SI-PROMOTE: ds_write_b32
+; SI-PROMOTE: ds_write_b32
+; SI-PROMOTE: ds_read_b32
+; SI-PROMOTE: ds_read_b32
+; SI-PROMOTE: ds_read_b32
+; SI-PROMOTE: ds_read_b32
+define void @private_access_v2f64_alloca(<2 x double> addrspace(1)* noalias %out, <2 x double> addrspace(1)* noalias %in, i32 %b) nounwind {
+ %val = load <2 x double>, <2 x double> addrspace(1)* %in, align 16
+ %array = alloca <2 x double>, i32 16, align 16
+ %ptr = getelementptr <2 x double>, <2 x double>* %array, i32 %b
+ store <2 x double> %val, <2 x double>* %ptr, align 16
+ call void @llvm.AMDGPU.barrier.local() noduplicate nounwind
+ %result = load <2 x double>, <2 x double>* %ptr, align 16
+ store <2 x double> %result, <2 x double> addrspace(1)* %out, align 16
+ ret void
+}
+
+; SI-LABEL: {{^}}private_access_i64_alloca:
+
+; SI-ALLOCA: buffer_store_dwordx2
+; SI-ALLOCA: buffer_load_dwordx2
+
+; SI-PROMOTE: ds_write_b64
+; SI-PROMOTE: ds_read_b64
+define void @private_access_i64_alloca(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i32 %b) nounwind {
+ %val = load i64, i64 addrspace(1)* %in, align 8
+ %array = alloca i64, i32 16, align 8
+ %ptr = getelementptr i64, i64* %array, i32 %b
+ store i64 %val, i64* %ptr, align 8
+ call void @llvm.AMDGPU.barrier.local() noduplicate nounwind
+ %result = load i64, i64* %ptr, align 8
+ store i64 %result, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; SI-LABEL: {{^}}private_access_v2i64_alloca:
+
+; SI-ALLOCA: buffer_store_dwordx4
+; SI-ALLOCA: buffer_load_dwordx4
+
+; SI-PROMOTE: ds_write_b32
+; SI-PROMOTE: ds_write_b32
+; SI-PROMOTE: ds_write_b32
+; SI-PROMOTE: ds_write_b32
+; SI-PROMOTE: ds_read_b32
+; SI-PROMOTE: ds_read_b32
+; SI-PROMOTE: ds_read_b32
+; SI-PROMOTE: ds_read_b32
+define void @private_access_v2i64_alloca(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in, i32 %b) nounwind {
+ %val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16
+ %array = alloca <2 x i64>, i32 16, align 16
+ %ptr = getelementptr <2 x i64>, <2 x i64>* %array, i32 %b
+ store <2 x i64> %val, <2 x i64>* %ptr, align 16
+ call void @llvm.AMDGPU.barrier.local() noduplicate nounwind
+ %result = load <2 x i64>, <2 x i64>* %ptr, align 16
+ store <2 x i64> %result, <2 x i64> addrspace(1)* %out, align 16
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/infinite-loop-evergreen.ll b/test/CodeGen/AMDGPU/infinite-loop-evergreen.ll
new file mode 100644
index 000000000000..f6e39b3d8306
--- /dev/null
+++ b/test/CodeGen/AMDGPU/infinite-loop-evergreen.ll
@@ -0,0 +1,10 @@
+; XFAIL: *
+; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck %s
+
+define void @inf_loop_irreducible_cfg() nounwind {
+entry:
+ br label %block
+
+block:
+ br label %block
+}
diff --git a/test/CodeGen/AMDGPU/infinite-loop.ll b/test/CodeGen/AMDGPU/infinite-loop.ll
new file mode 100644
index 000000000000..7233aa57fd78
--- /dev/null
+++ b/test/CodeGen/AMDGPU/infinite-loop.ll
@@ -0,0 +1,18 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+; SI-LABEL: {{^}}infinite_loop:
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3e7
+; SI: BB0_1:
+; SI: buffer_store_dword [[REG]]
+; SI: s_waitcnt vmcnt(0) expcnt(0)
+; SI: s_branch BB0_1
+define void @infinite_loop(i32 addrspace(1)* %out) {
+entry:
+ br label %for.body
+
+for.body: ; preds = %entry, %for.body
+ store i32 999, i32 addrspace(1)* %out, align 4
+ br label %for.body
+}
+
diff --git a/test/CodeGen/AMDGPU/inline-asm.ll b/test/CodeGen/AMDGPU/inline-asm.ll
new file mode 100644
index 000000000000..efc2292de3a5
--- /dev/null
+++ b/test/CodeGen/AMDGPU/inline-asm.ll
@@ -0,0 +1,12 @@
+; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
+
+; CHECK: {{^}}inline_asm:
+; CHECK: s_endpgm
+; CHECK: s_endpgm
+define void @inline_asm(i32 addrspace(1)* %out) {
+entry:
+ store i32 5, i32 addrspace(1)* %out
+ call void asm sideeffect "s_endpgm", ""()
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/inline-calls.ll b/test/CodeGen/AMDGPU/inline-calls.ll
new file mode 100644
index 000000000000..33a4c832e75e
--- /dev/null
+++ b/test/CodeGen/AMDGPU/inline-calls.ll
@@ -0,0 +1,25 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck %s
+
+; CHECK-NOT: {{^}}func:
+define internal fastcc i32 @func(i32 %a) {
+entry:
+ %tmp0 = add i32 %a, 1
+ ret i32 %tmp0
+}
+
+; CHECK: {{^}}kernel:
+define void @kernel(i32 addrspace(1)* %out) {
+entry:
+ %tmp0 = call i32 @func(i32 1)
+ store i32 %tmp0, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK: {{^}}kernel2:
+define void @kernel2(i32 addrspace(1)* %out) {
+entry:
+ call void @kernel(i32 addrspace(1)* %out)
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/input-mods.ll b/test/CodeGen/AMDGPU/input-mods.ll
new file mode 100644
index 000000000000..1c4d285cbcb1
--- /dev/null
+++ b/test/CodeGen/AMDGPU/input-mods.ll
@@ -0,0 +1,26 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG
+;RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM
+
+;EG-LABEL: {{^}}test:
+;EG: EXP_IEEE *
+;CM-LABEL: {{^}}test:
+;CM: EXP_IEEE T{{[0-9]+}}.X, -|T{{[0-9]+}}.X|
+;CM: EXP_IEEE T{{[0-9]+}}.Y (MASKED), -|T{{[0-9]+}}.X|
+;CM: EXP_IEEE T{{[0-9]+}}.Z (MASKED), -|T{{[0-9]+}}.X|
+;CM: EXP_IEEE * T{{[0-9]+}}.W (MASKED), -|T{{[0-9]+}}.X|
+
+define void @test(<4 x float> inreg %reg0) #0 {
+ %r0 = extractelement <4 x float> %reg0, i32 0
+ %r1 = call float @llvm.fabs.f32(float %r0)
+ %r2 = fsub float -0.000000e+00, %r1
+ %r3 = call float @llvm.exp2.f32(float %r2)
+ %vec = insertelement <4 x float> undef, float %r3, i32 0
+ call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
+ ret void
+}
+
+declare float @llvm.exp2.f32(float) readnone
+declare float @llvm.fabs.f32(float) readnone
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+
+attributes #0 = { "ShaderType"="0" }
diff --git a/test/CodeGen/AMDGPU/insert_subreg.ll b/test/CodeGen/AMDGPU/insert_subreg.ll
new file mode 100644
index 000000000000..4a5e8869c2df
--- /dev/null
+++ b/test/CodeGen/AMDGPU/insert_subreg.ll
@@ -0,0 +1,16 @@
+; RUN: llc -march=amdgcn -mcpu=SI -mattr=-promote-alloca -verify-machineinstrs < %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %s
+
+; Test that INSERT_SUBREG instructions don't have non-register operands after
+; instruction selection.
+
+; Make sure this doesn't crash
+; CHECK-LABEL: test:
+define void @test(i64 addrspace(1)* %out) {
+entry:
+ %tmp0 = alloca [16 x i32]
+ %tmp1 = ptrtoint [16 x i32]* %tmp0 to i32
+ %tmp2 = sext i32 %tmp1 to i64
+ store i64 %tmp2, i64 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/insert_vector_elt.ll b/test/CodeGen/AMDGPU/insert_vector_elt.ll
new file mode 100644
index 000000000000..6de3d408c486
--- /dev/null
+++ b/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -0,0 +1,252 @@
+; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI %s
+
+; FIXME: Broken on evergreen
+; FIXME: For some reason the 8 and 16 vectors are being stored as
+; individual elements instead of 128-bit stores.
+
+
+; FIXME: Why is the constant moved into the intermediate register and
+; not just directly into the vector component?
+
+; SI-LABEL: {{^}}insertelement_v4f32_0:
+; s_load_dwordx4 s{{[}}[[LOW_REG:[0-9]+]]:
+; v_mov_b32_e32
+; v_mov_b32_e32 [[CONSTREG:v[0-9]+]], 5.000000e+00
+; v_mov_b32_e32 v[[LOW_REG]], [[CONSTREG]]
+; buffer_store_dwordx4 v{{[}}[[LOW_REG]]:
+define void @insertelement_v4f32_0(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
+ %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 0
+ store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
+ ret void
+}
+
+; SI-LABEL: {{^}}insertelement_v4f32_1:
+define void @insertelement_v4f32_1(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
+ %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 1
+ store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
+ ret void
+}
+
+; SI-LABEL: {{^}}insertelement_v4f32_2:
+define void @insertelement_v4f32_2(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
+ %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 2
+ store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
+ ret void
+}
+
+; SI-LABEL: {{^}}insertelement_v4f32_3:
+define void @insertelement_v4f32_3(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
+ %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 3
+ store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
+ ret void
+}
+
+; SI-LABEL: {{^}}insertelement_v4i32_0:
+define void @insertelement_v4i32_0(<4 x i32> addrspace(1)* %out, <4 x i32> %a) nounwind {
+ %vecins = insertelement <4 x i32> %a, i32 999, i32 0
+ store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16
+ ret void
+}
+
+; SI-LABEL: {{^}}dynamic_insertelement_v2f32:
+; SI: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000
+; SI: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]]
+; SI: buffer_store_dwordx2 {{v\[}}[[LOW_RESULT_REG]]:
+define void @dynamic_insertelement_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, i32 %b) nounwind {
+ %vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 %b
+ store <2 x float> %vecins, <2 x float> addrspace(1)* %out, align 8
+ ret void
+}
+
+; SI-LABEL: {{^}}dynamic_insertelement_v4f32:
+; SI: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000
+; SI: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]]
+; SI: buffer_store_dwordx4 {{v\[}}[[LOW_RESULT_REG]]:
+define void @dynamic_insertelement_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, i32 %b) nounwind {
+ %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 %b
+ store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
+ ret void
+}
+
+; SI-LABEL: {{^}}dynamic_insertelement_v8f32:
+; FIXMESI: buffer_store_dwordx4
+; FIXMESI: buffer_store_dwordx4
+define void @dynamic_insertelement_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, i32 %b) nounwind {
+ %vecins = insertelement <8 x float> %a, float 5.000000e+00, i32 %b
+ store <8 x float> %vecins, <8 x float> addrspace(1)* %out, align 32
+ ret void
+}
+
+; SI-LABEL: {{^}}dynamic_insertelement_v16f32:
+; FIXMESI: buffer_store_dwordx4
+; FIXMESI: buffer_store_dwordx4
+; FIXMESI: buffer_store_dwordx4
+; FIXMESI: buffer_store_dwordx4
+define void @dynamic_insertelement_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, i32 %b) nounwind {
+ %vecins = insertelement <16 x float> %a, float 5.000000e+00, i32 %b
+ store <16 x float> %vecins, <16 x float> addrspace(1)* %out, align 64
+ ret void
+}
+
+; SI-LABEL: {{^}}dynamic_insertelement_v2i32:
+; SI: buffer_store_dwordx2
+define void @dynamic_insertelement_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, i32 %b) nounwind {
+ %vecins = insertelement <2 x i32> %a, i32 5, i32 %b
+ store <2 x i32> %vecins, <2 x i32> addrspace(1)* %out, align 8
+ ret void
+}
+
+; SI-LABEL: {{^}}dynamic_insertelement_v4i32:
+; SI: buffer_store_dwordx4
+define void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, i32 %b) nounwind {
+ %vecins = insertelement <4 x i32> %a, i32 5, i32 %b
+ store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16
+ ret void
+}
+
+; SI-LABEL: {{^}}dynamic_insertelement_v8i32:
+; FIXMESI: buffer_store_dwordx4
+; FIXMESI: buffer_store_dwordx4
+define void @dynamic_insertelement_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, i32 %b) nounwind {
+ %vecins = insertelement <8 x i32> %a, i32 5, i32 %b
+ store <8 x i32> %vecins, <8 x i32> addrspace(1)* %out, align 32
+ ret void
+}
+
+; SI-LABEL: {{^}}dynamic_insertelement_v16i32:
+; FIXMESI: buffer_store_dwordx4
+; FIXMESI: buffer_store_dwordx4
+; FIXMESI: buffer_store_dwordx4
+; FIXMESI: buffer_store_dwordx4
+define void @dynamic_insertelement_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> %a, i32 %b) nounwind {
+ %vecins = insertelement <16 x i32> %a, i32 5, i32 %b
+ store <16 x i32> %vecins, <16 x i32> addrspace(1)* %out, align 64
+ ret void
+}
+
+
+; SI-LABEL: {{^}}dynamic_insertelement_v2i16:
+; FIXMESI: buffer_store_dwordx2
+define void @dynamic_insertelement_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, i32 %b) nounwind {
+ %vecins = insertelement <2 x i16> %a, i16 5, i32 %b
+ store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out, align 8
+ ret void
+}
+
+; SI-LABEL: {{^}}dynamic_insertelement_v4i16:
+; FIXMESI: buffer_store_dwordx4
+define void @dynamic_insertelement_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, i32 %b) nounwind {
+ %vecins = insertelement <4 x i16> %a, i16 5, i32 %b
+ store <4 x i16> %vecins, <4 x i16> addrspace(1)* %out, align 16
+ ret void
+}
+
+
+; SI-LABEL: {{^}}dynamic_insertelement_v2i8:
+; FIXMESI: BUFFER_STORE_USHORT
+define void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> %a, i32 %b) nounwind {
+ %vecins = insertelement <2 x i8> %a, i8 5, i32 %b
+ store <2 x i8> %vecins, <2 x i8> addrspace(1)* %out, align 8
+ ret void
+}
+
+; SI-LABEL: {{^}}dynamic_insertelement_v4i8:
+; FIXMESI: buffer_store_dword
+define void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, i32 %b) nounwind {
+ %vecins = insertelement <4 x i8> %a, i8 5, i32 %b
+ store <4 x i8> %vecins, <4 x i8> addrspace(1)* %out, align 16
+ ret void
+}
+
+; SI-LABEL: {{^}}dynamic_insertelement_v8i8:
+; FIXMESI: buffer_store_dwordx2
+define void @dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> %a, i32 %b) nounwind {
+ %vecins = insertelement <8 x i8> %a, i8 5, i32 %b
+ store <8 x i8> %vecins, <8 x i8> addrspace(1)* %out, align 16
+ ret void
+}
+
+; SI-LABEL: {{^}}dynamic_insertelement_v16i8:
+; FIXMESI: buffer_store_dwordx4
+define void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> %a, i32 %b) nounwind {
+ %vecins = insertelement <16 x i8> %a, i8 5, i32 %b
+ store <16 x i8> %vecins, <16 x i8> addrspace(1)* %out, align 16
+ ret void
+}
+
+; This test requires handling INSERT_SUBREG in SIFixSGPRCopies. Check that
+; the compiler doesn't crash.
+; SI-LABEL: {{^}}insert_split_bb:
+define void @insert_split_bb(<2 x i32> addrspace(1)* %out, i32 addrspace(1)* %in, i32 %a, i32 %b) {
+entry:
+ %0 = insertelement <2 x i32> undef, i32 %a, i32 0
+ %1 = icmp eq i32 %a, 0
+ br i1 %1, label %if, label %else
+
+if:
+ %2 = load i32, i32 addrspace(1)* %in
+ %3 = insertelement <2 x i32> %0, i32 %2, i32 1
+ br label %endif
+
+else:
+ %4 = getelementptr i32, i32 addrspace(1)* %in, i32 1
+ %5 = load i32, i32 addrspace(1)* %4
+ %6 = insertelement <2 x i32> %0, i32 %5, i32 1
+ br label %endif
+
+endif:
+ %7 = phi <2 x i32> [%3, %if], [%6, %else]
+ store <2 x i32> %7, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; SI-LABEL: {{^}}dynamic_insertelement_v2f64:
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: s_endpgm
+define void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, i32 %b) nounwind {
+ %vecins = insertelement <2 x double> %a, double 8.0, i32 %b
+ store <2 x double> %vecins, <2 x double> addrspace(1)* %out, align 16
+ ret void
+}
+
+; SI-LABEL: {{^}}dynamic_insertelement_v2i64:
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: s_endpgm
+define void @dynamic_insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %a, i32 %b) nounwind {
+ %vecins = insertelement <2 x i64> %a, i64 5, i32 %b
+ store <2 x i64> %vecins, <2 x i64> addrspace(1)* %out, align 8
+ ret void
+}
+
+; SI-LABEL: {{^}}dynamic_insertelement_v4f64:
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: s_endpgm
+define void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, i32 %b) nounwind {
+ %vecins = insertelement <4 x double> %a, double 8.0, i32 %b
+ store <4 x double> %vecins, <4 x double> addrspace(1)* %out, align 16
+ ret void
+}
+
+; SI-LABEL: {{^}}dynamic_insertelement_v8f64:
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: s_endpgm
+define void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, i32 %b) nounwind {
+ %vecins = insertelement <8 x double> %a, double 8.0, i32 %b
+ store <8 x double> %vecins, <8 x double> addrspace(1)* %out, align 16
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/jump-address.ll b/test/CodeGen/AMDGPU/jump-address.ll
new file mode 100644
index 000000000000..f55912e37401
--- /dev/null
+++ b/test/CodeGen/AMDGPU/jump-address.ll
@@ -0,0 +1,52 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+; CHECK: JUMP @6
+; CHECK: EXPORT
+; CHECK-NOT: EXPORT
+
+define void @main() #0 {
+main_body:
+ %0 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+ %1 = extractelement <4 x float> %0, i32 0
+ %2 = bitcast float %1 to i32
+ %3 = icmp eq i32 %2, 0
+ %4 = sext i1 %3 to i32
+ %5 = bitcast i32 %4 to float
+ %6 = bitcast float %5 to i32
+ %7 = icmp ne i32 %6, 0
+ br i1 %7, label %ENDIF, label %ELSE
+
+ELSE: ; preds = %main_body
+ %8 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+ %9 = extractelement <4 x float> %8, i32 0
+ %10 = bitcast float %9 to i32
+ %11 = icmp eq i32 %10, 1
+ %12 = sext i1 %11 to i32
+ %13 = bitcast i32 %12 to float
+ %14 = bitcast float %13 to i32
+ %15 = icmp ne i32 %14, 0
+ br i1 %15, label %IF13, label %ENDIF
+
+ENDIF: ; preds = %IF13, %ELSE, %main_body
+ %temp.0 = phi float [ 0xFFF8000000000000, %main_body ], [ 0.000000e+00, %ELSE ], [ 0.000000e+00, %IF13 ]
+ %temp1.0 = phi float [ 0.000000e+00, %main_body ], [ %23, %IF13 ], [ 0.000000e+00, %ELSE ]
+ %temp2.0 = phi float [ 1.000000e+00, %main_body ], [ 0.000000e+00, %ELSE ], [ 0.000000e+00, %IF13 ]
+ %temp3.0 = phi float [ 5.000000e-01, %main_body ], [ 0.000000e+00, %ELSE ], [ 0.000000e+00, %IF13 ]
+ %16 = insertelement <4 x float> undef, float %temp.0, i32 0
+ %17 = insertelement <4 x float> %16, float %temp1.0, i32 1
+ %18 = insertelement <4 x float> %17, float %temp2.0, i32 2
+ %19 = insertelement <4 x float> %18, float %temp3.0, i32 3
+ call void @llvm.R600.store.swizzle(<4 x float> %19, i32 0, i32 0)
+ ret void
+
+IF13: ; preds = %ELSE
+ %20 = load <4 x float>, <4 x float> addrspace(8)* null
+ %21 = extractelement <4 x float> %20, i32 0
+ %22 = fsub float -0.000000e+00, %21
+ %23 = fadd float 0xFFF8000000000000, %22
+ br label %ENDIF
+}
+
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+
+attributes #0 = { "ShaderType"="0" }
diff --git a/test/CodeGen/AMDGPU/kcache-fold.ll b/test/CodeGen/AMDGPU/kcache-fold.ll
new file mode 100644
index 000000000000..7e2291cfdc35
--- /dev/null
+++ b/test/CodeGen/AMDGPU/kcache-fold.ll
@@ -0,0 +1,100 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+; CHECK: {{^}}main1:
+; CHECK: MOV * T{{[0-9]+\.[XYZW], KC0}}
+define void @main1() {
+main_body:
+ %0 = load <4 x float>, <4 x float> addrspace(8)* null
+ %1 = extractelement <4 x float> %0, i32 0
+ %2 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+ %3 = extractelement <4 x float> %2, i32 0
+ %4 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+ %5 = extractelement <4 x float> %4, i32 0
+ %6 = fcmp ogt float %1, 0.000000e+00
+ %7 = select i1 %6, float %3, float %5
+ %8 = load <4 x float>, <4 x float> addrspace(8)* null
+ %9 = extractelement <4 x float> %8, i32 1
+ %10 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+ %11 = extractelement <4 x float> %10, i32 1
+ %12 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+ %13 = extractelement <4 x float> %12, i32 1
+ %14 = fcmp ogt float %9, 0.000000e+00
+ %15 = select i1 %14, float %11, float %13
+ %16 = load <4 x float>, <4 x float> addrspace(8)* null
+ %17 = extractelement <4 x float> %16, i32 2
+ %18 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+ %19 = extractelement <4 x float> %18, i32 2
+ %20 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+ %21 = extractelement <4 x float> %20, i32 2
+ %22 = fcmp ogt float %17, 0.000000e+00
+ %23 = select i1 %22, float %19, float %21
+ %24 = load <4 x float>, <4 x float> addrspace(8)* null
+ %25 = extractelement <4 x float> %24, i32 3
+ %26 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+ %27 = extractelement <4 x float> %26, i32 3
+ %28 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+ %29 = extractelement <4 x float> %28, i32 3
+ %30 = fcmp ogt float %25, 0.000000e+00
+ %31 = select i1 %30, float %27, float %29
+ %32 = call float @llvm.AMDIL.clamp.(float %7, float 0.000000e+00, float 1.000000e+00)
+ %33 = call float @llvm.AMDIL.clamp.(float %15, float 0.000000e+00, float 1.000000e+00)
+ %34 = call float @llvm.AMDIL.clamp.(float %23, float 0.000000e+00, float 1.000000e+00)
+ %35 = call float @llvm.AMDIL.clamp.(float %31, float 0.000000e+00, float 1.000000e+00)
+ %36 = insertelement <4 x float> undef, float %32, i32 0
+ %37 = insertelement <4 x float> %36, float %33, i32 1
+ %38 = insertelement <4 x float> %37, float %34, i32 2
+ %39 = insertelement <4 x float> %38, float %35, i32 3
+ call void @llvm.R600.store.swizzle(<4 x float> %39, i32 0, i32 0)
+ ret void
+}
+
+; CHECK: {{^}}main2:
+; CHECK-NOT: MOV
+define void @main2() {
+main_body:
+ %0 = load <4 x float>, <4 x float> addrspace(8)* null
+ %1 = extractelement <4 x float> %0, i32 0
+ %2 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+ %3 = extractelement <4 x float> %2, i32 0
+ %4 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+ %5 = extractelement <4 x float> %4, i32 1
+ %6 = fcmp ogt float %1, 0.000000e+00
+ %7 = select i1 %6, float %3, float %5
+ %8 = load <4 x float>, <4 x float> addrspace(8)* null
+ %9 = extractelement <4 x float> %8, i32 1
+ %10 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+ %11 = extractelement <4 x float> %10, i32 0
+ %12 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+ %13 = extractelement <4 x float> %12, i32 1
+ %14 = fcmp ogt float %9, 0.000000e+00
+ %15 = select i1 %14, float %11, float %13
+ %16 = load <4 x float>, <4 x float> addrspace(8)* null
+ %17 = extractelement <4 x float> %16, i32 2
+ %18 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+ %19 = extractelement <4 x float> %18, i32 3
+ %20 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+ %21 = extractelement <4 x float> %20, i32 2
+ %22 = fcmp ogt float %17, 0.000000e+00
+ %23 = select i1 %22, float %19, float %21
+ %24 = load <4 x float>, <4 x float> addrspace(8)* null
+ %25 = extractelement <4 x float> %24, i32 3
+ %26 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+ %27 = extractelement <4 x float> %26, i32 3
+ %28 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+ %29 = extractelement <4 x float> %28, i32 2
+ %30 = fcmp ogt float %25, 0.000000e+00
+ %31 = select i1 %30, float %27, float %29
+ %32 = call float @llvm.AMDIL.clamp.(float %7, float 0.000000e+00, float 1.000000e+00)
+ %33 = call float @llvm.AMDIL.clamp.(float %15, float 0.000000e+00, float 1.000000e+00)
+ %34 = call float @llvm.AMDIL.clamp.(float %23, float 0.000000e+00, float 1.000000e+00)
+ %35 = call float @llvm.AMDIL.clamp.(float %31, float 0.000000e+00, float 1.000000e+00)
+ %36 = insertelement <4 x float> undef, float %32, i32 0
+ %37 = insertelement <4 x float> %36, float %33, i32 1
+ %38 = insertelement <4 x float> %37, float %34, i32 2
+ %39 = insertelement <4 x float> %38, float %35, i32 3
+ call void @llvm.R600.store.swizzle(<4 x float> %39, i32 0, i32 0)
+ ret void
+}
+
+declare float @llvm.AMDIL.clamp.(float, float, float) readnone
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
diff --git a/test/CodeGen/AMDGPU/kernel-args.ll b/test/CodeGen/AMDGPU/kernel-args.ll
new file mode 100644
index 000000000000..1dd7c2cb7995
--- /dev/null
+++ b/test/CodeGen/AMDGPU/kernel-args.ll
@@ -0,0 +1,473 @@
+; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=GCN --check-prefix=FUNC
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=VI --check-prefix=GCN --check-prefix=FUNC
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC
+; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG --check-prefix=FUNC
+
+; FUNC-LABEL: {{^}}i8_arg:
+; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
+; GCN: buffer_load_ubyte
+
+define void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) nounwind {
+entry:
+ %0 = zext i8 %in to i32
+ store i32 %0, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}i8_zext_arg:
+; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
+; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
+; VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
+
+define void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zeroext %in) nounwind {
+entry:
+ %0 = zext i8 %in to i32
+ store i32 %0, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}i8_sext_arg:
+; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
+; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
+; VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
+
+define void @i8_sext_arg(i32 addrspace(1)* nocapture %out, i8 signext %in) nounwind {
+entry:
+ %0 = sext i8 %in to i32
+ store i32 %0, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}i16_arg:
+; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
+; GCN: buffer_load_ushort
+
+define void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) nounwind {
+entry:
+ %0 = zext i16 %in to i32
+ store i32 %0, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}i16_zext_arg:
+; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
+; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
+; VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
+
+define void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 zeroext %in) nounwind {
+entry:
+ %0 = zext i16 %in to i32
+ store i32 %0, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}i16_sext_arg:
+; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
+; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
+; VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
+
+define void @i16_sext_arg(i32 addrspace(1)* nocapture %out, i16 signext %in) nounwind {
+entry:
+ %0 = sext i16 %in to i32
+ store i32 %0, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}i32_arg:
+; EG: T{{[0-9]\.[XYZW]}}, KC0[2].Z
+; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
+; VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
+define void @i32_arg(i32 addrspace(1)* nocapture %out, i32 %in) nounwind {
+entry:
+ store i32 %in, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}f32_arg:
+; EG: T{{[0-9]\.[XYZW]}}, KC0[2].Z
+; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
+; VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
+define void @f32_arg(float addrspace(1)* nocapture %out, float %in) nounwind {
+entry:
+ store float %in, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v2i8_arg:
+; EG: VTX_READ_8
+; EG: VTX_READ_8
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+define void @v2i8_arg(<2 x i8> addrspace(1)* %out, <2 x i8> %in) {
+entry:
+ store <2 x i8> %in, <2 x i8> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v2i16_arg:
+; EG: VTX_READ_16
+; EG: VTX_READ_16
+; GCN-DAG: buffer_load_ushort
+; GCN-DAG: buffer_load_ushort
+define void @v2i16_arg(<2 x i16> addrspace(1)* %out, <2 x i16> %in) {
+entry:
+ store <2 x i16> %in, <2 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v2i32_arg:
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W
+; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb
+; VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x2c
+define void @v2i32_arg(<2 x i32> addrspace(1)* nocapture %out, <2 x i32> %in) nounwind {
+entry:
+ store <2 x i32> %in, <2 x i32> addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v2f32_arg:
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W
+; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb
+; VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x2c
+define void @v2f32_arg(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) nounwind {
+entry:
+ store <2 x float> %in, <2 x float> addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v3i8_arg:
+; VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 40
+; VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 41
+; VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 42
+define void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind {
+entry:
+ store <3 x i8> %in, <3 x i8> addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v3i16_arg:
+; VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 44
+; VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 46
+; VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 48
+define void @v3i16_arg(<3 x i16> addrspace(1)* nocapture %out, <3 x i16> %in) nounwind {
+entry:
+ store <3 x i16> %in, <3 x i16> addrspace(1)* %out, align 4
+ ret void
+}
+; FUNC-LABEL: {{^}}v3i32_arg:
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
+; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd
+; VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x34
+define void @v3i32_arg(<3 x i32> addrspace(1)* nocapture %out, <3 x i32> %in) nounwind {
+entry:
+ store <3 x i32> %in, <3 x i32> addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v3f32_arg:
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
+; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd
+; VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x34
+define void @v3f32_arg(<3 x float> addrspace(1)* nocapture %out, <3 x float> %in) nounwind {
+entry:
+ store <3 x float> %in, <3 x float> addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v4i8_arg:
+; EG: VTX_READ_8
+; EG: VTX_READ_8
+; EG: VTX_READ_8
+; EG: VTX_READ_8
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+define void @v4i8_arg(<4 x i8> addrspace(1)* %out, <4 x i8> %in) {
+entry:
+ store <4 x i8> %in, <4 x i8> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v4i16_arg:
+; EG: VTX_READ_16
+; EG: VTX_READ_16
+; EG: VTX_READ_16
+; EG: VTX_READ_16
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+define void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in) {
+entry:
+ store <4 x i16> %in, <4 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v4i32_arg:
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X
+; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd
+; VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34
+define void @v4i32_arg(<4 x i32> addrspace(1)* nocapture %out, <4 x i32> %in) nounwind {
+entry:
+ store <4 x i32> %in, <4 x i32> addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v4f32_arg:
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X
+; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd
+; VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34
+define void @v4f32_arg(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) nounwind {
+entry:
+ store <4 x float> %in, <4 x float> addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v8i8_arg:
+; EG: VTX_READ_8
+; EG: VTX_READ_8
+; EG: VTX_READ_8
+; EG: VTX_READ_8
+; EG: VTX_READ_8
+; EG: VTX_READ_8
+; EG: VTX_READ_8
+; EG: VTX_READ_8
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+define void @v8i8_arg(<8 x i8> addrspace(1)* %out, <8 x i8> %in) {
+entry:
+ store <8 x i8> %in, <8 x i8> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v8i16_arg:
+; EG: VTX_READ_16
+; EG: VTX_READ_16
+; EG: VTX_READ_16
+; EG: VTX_READ_16
+; EG: VTX_READ_16
+; EG: VTX_READ_16
+; EG: VTX_READ_16
+; EG: VTX_READ_16
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+define void @v8i16_arg(<8 x i16> addrspace(1)* %out, <8 x i16> %in) {
+entry:
+ store <8 x i16> %in, <8 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v8i32_arg:
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].X
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Y
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X
+; SI: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x11
+; VI: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x44
+define void @v8i32_arg(<8 x i32> addrspace(1)* nocapture %out, <8 x i32> %in) nounwind {
+entry:
+ store <8 x i32> %in, <8 x i32> addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v8f32_arg:
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].X
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Y
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X
+; SI: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x11
+define void @v8f32_arg(<8 x float> addrspace(1)* nocapture %out, <8 x float> %in) nounwind {
+entry:
+ store <8 x float> %in, <8 x float> addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v16i8_arg:
+; EG: VTX_READ_8
+; EG: VTX_READ_8
+; EG: VTX_READ_8
+; EG: VTX_READ_8
+; EG: VTX_READ_8
+; EG: VTX_READ_8
+; EG: VTX_READ_8
+; EG: VTX_READ_8
+; EG: VTX_READ_8
+; EG: VTX_READ_8
+; EG: VTX_READ_8
+; EG: VTX_READ_8
+; EG: VTX_READ_8
+; EG: VTX_READ_8
+; EG: VTX_READ_8
+; EG: VTX_READ_8
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+define void @v16i8_arg(<16 x i8> addrspace(1)* %out, <16 x i8> %in) {
+entry:
+ store <16 x i8> %in, <16 x i8> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v16i16_arg:
+; EG: VTX_READ_16
+; EG: VTX_READ_16
+; EG: VTX_READ_16
+; EG: VTX_READ_16
+; EG: VTX_READ_16
+; EG: VTX_READ_16
+; EG: VTX_READ_16
+; EG: VTX_READ_16
+; EG: VTX_READ_16
+; EG: VTX_READ_16
+; EG: VTX_READ_16
+; EG: VTX_READ_16
+; EG: VTX_READ_16
+; EG: VTX_READ_16
+; EG: VTX_READ_16
+; EG: VTX_READ_16
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+define void @v16i16_arg(<16 x i16> addrspace(1)* %out, <16 x i16> %in) {
+entry:
+ store <16 x i16> %in, <16 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v16i32_arg:
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].X
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Y
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Z
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].W
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].W
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].X
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Y
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X
+; SI: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x19
+; VI: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x64
+define void @v16i32_arg(<16 x i32> addrspace(1)* nocapture %out, <16 x i32> %in) nounwind {
+entry:
+ store <16 x i32> %in, <16 x i32> addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v16f32_arg:
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].X
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Y
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Z
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].W
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].W
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].X
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Y
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W
+; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X
+; SI: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x19
+; VI: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x64
+define void @v16f32_arg(<16 x float> addrspace(1)* nocapture %out, <16 x float> %in) nounwind {
+entry:
+ store <16 x float> %in, <16 x float> addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}kernel_arg_i64:
+; GCN: s_load_dwordx2
+; GCN: s_load_dwordx2
+; GCN: buffer_store_dwordx2
+define void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind {
+ store i64 %a, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}f64_kernel_arg:
+; SI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0x9
+; SI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0xb
+; VI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0x24
+; VI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0x2c
+; GCN: buffer_store_dwordx2
+define void @f64_kernel_arg(double addrspace(1)* %out, double %in) {
+entry:
+ store double %in, double addrspace(1)* %out
+ ret void
+}
+
+; XFUNC-LABEL: {{^}}kernel_arg_v1i64:
+; XGCN: s_load_dwordx2
+; XGCN: s_load_dwordx2
+; XGCN: buffer_store_dwordx2
+; define void @kernel_arg_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a) nounwind {
+; store <1 x i64> %a, <1 x i64> addrspace(1)* %out, align 8
+; ret void
+; }
diff --git a/test/CodeGen/AMDGPU/large-alloca.ll b/test/CodeGen/AMDGPU/large-alloca.ll
new file mode 100644
index 000000000000..671833d1a33a
--- /dev/null
+++ b/test/CodeGen/AMDGPU/large-alloca.ll
@@ -0,0 +1,15 @@
+; XFAIL: *
+; REQUIRES: asserts
+; RUN: llc -march=amdgcn -mcpu=SI < %s
+; RUN: llc -march=amdgcn -mcpu=tonga < %s
+
+define void @large_alloca(i32 addrspace(1)* %out, i32 %x, i32 %y) nounwind {
+ %large = alloca [8192 x i32], align 4
+ %gep = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 8191
+ store i32 %x, i32* %gep
+ %gep1 = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 %y
+ %0 = load i32, i32* %gep1
+ store i32 %0, i32 addrspace(1)* %out
+ ret void
+}
+
diff --git a/test/CodeGen/AMDGPU/large-constant-initializer.ll b/test/CodeGen/AMDGPU/large-constant-initializer.ll
new file mode 100644
index 000000000000..9975b1b7f5cc
--- /dev/null
+++ b/test/CodeGen/AMDGPU/large-constant-initializer.ll
@@ -0,0 +1,19 @@
+; RUN: llc -march=amdgcn -mcpu=SI < %s
+; RUN: llc -march=amdgcn -mcpu=tonga < %s
+; CHECK: s_endpgm
+
+@gv = external unnamed_addr addrspace(2) constant [239 x i32], align 4
+
+define void @opencv_cvtfloat_crash(i32 addrspace(1)* %out, i32 %x) nounwind {
+ %val = load i32, i32 addrspace(2)* getelementptr ([239 x i32], [239 x i32] addrspace(2)* @gv, i64 0, i64 239), align 4
+ %mul12 = mul nsw i32 %val, 7
+ br i1 undef, label %exit, label %bb
+
+bb:
+ %cmp = icmp slt i32 %x, 0
+ br label %exit
+
+exit:
+ ret void
+}
+
diff --git a/test/CodeGen/AMDGPU/lds-initializer.ll b/test/CodeGen/AMDGPU/lds-initializer.ll
new file mode 100644
index 000000000000..bf8df63be9fd
--- /dev/null
+++ b/test/CodeGen/AMDGPU/lds-initializer.ll
@@ -0,0 +1,13 @@
+; RUN: not llc -march=amdgcn -mcpu=SI < %s 2>&1 | FileCheck %s
+; RUN: not llc -march=amdgcn -mcpu=tonga < %s 2>&1 | FileCheck %s
+
+; CHECK: error: unsupported initializer for address space in load_init_lds_global
+
+@lds = addrspace(3) global [8 x i32] [i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8]
+
+define void @load_init_lds_global(i32 addrspace(1)* %out, i1 %p) {
+ %gep = getelementptr [8 x i32], [8 x i32] addrspace(3)* @lds, i32 0, i32 10
+ %ld = load i32, i32 addrspace(3)* %gep
+ store i32 %ld, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/lds-oqap-crash.ll b/test/CodeGen/AMDGPU/lds-oqap-crash.ll
new file mode 100644
index 000000000000..6ff6fc3d7afc
--- /dev/null
+++ b/test/CodeGen/AMDGPU/lds-oqap-crash.ll
@@ -0,0 +1,28 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck %s
+
+; The test is for a bug in R600EmitClauseMarkers.cpp where this pass
+; was searching for a use of the OQAP register in order to determine
+; if an LDS instruction could fit in the current clause, but never finding
+; one. This created an infinite loop and hung the compiler.
+;
+; The LDS instruction should not have been defining OQAP in the first place,
+; because the LDS instructions are pseudo instructions and the OQAP
+; reads and writes are bundled together in the same instruction.
+
+; CHECK: {{^}}lds_crash:
+define void @lds_crash(i32 addrspace(1)* %out, i32 addrspace(3)* %in, i32 %a, i32 %b, i32 %c) {
+entry:
+ %0 = load i32, i32 addrspace(3)* %in
+ ; This block needs to be > 115 ISA instructions to hit the bug,
+ ; so we'll use udiv instructions.
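+ ; (There is no hardware integer-divide instruction on R600, so each udiv
+ ; is expanded by the backend into a long sequence of ALU instructions, and
+ ; a handful of them is enough to exceed that threshold.)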
+ %div0 = udiv i32 %0, %b
+ %div1 = udiv i32 %div0, %a
+ %div2 = udiv i32 %div1, 11
+ %div3 = udiv i32 %div2, %a
+ %div4 = udiv i32 %div3, %b
+ %div5 = udiv i32 %div4, %c
+ %div6 = udiv i32 %div5, %div0
+ %div7 = udiv i32 %div6, %div1
+ store i32 %div7, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/lds-output-queue.ll b/test/CodeGen/AMDGPU/lds-output-queue.ll
new file mode 100644
index 000000000000..44ffc36af149
--- /dev/null
+++ b/test/CodeGen/AMDGPU/lds-output-queue.ll
@@ -0,0 +1,99 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck %s
+;
+; This test checks that the LDS output queue is empty at the end of
+; the ALU clause.
+
+; CHECK-LABEL: {{^}}lds_input_queue:
+; CHECK: LDS_READ_RET * OQAP
+; CHECK-NOT: ALU clause
+; CHECK: MOV * T{{[0-9]\.[XYZW]}}, OQAP
+
+@local_mem = internal unnamed_addr addrspace(3) global [2 x i32] undef, align 4
+
+define void @lds_input_queue(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %index) {
+entry:
+ %0 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* @local_mem, i32 0, i32 %index
+ %1 = load i32, i32 addrspace(3)* %0
+ call void @llvm.AMDGPU.barrier.local()
+
+ ; This will start a new clause for the vertex fetch
+ %2 = load i32, i32 addrspace(1)* %in
+ %3 = add i32 %1, %2
+ store i32 %3, i32 addrspace(1)* %out
+ ret void
+}
+
+declare void @llvm.AMDGPU.barrier.local()
+
+; The machine scheduler does not do proper alias analysis and assumes that
+; loads from global values (note that a global value is different from a
+; value in global memory: a global value is a value declared outside of a
+; function, and it can reside in any address space) alias with
+; all other loads.
+;
+; This is a problem for scheduling the reads from the local data share (lds).
+; These reads are implemented using two instructions. The first copies the
+; data from lds into the lds output queue, and the second moves the data from
+; the output queue into a register. These two instructions don't have to be
+; scheduled one after the other, but they do need to be scheduled in the same
+; clause. The aliasing problem mentioned above causes problems when there is a
+; load from global memory which immediately follows a load from a global value that
+; has been declared in the local memory space:
+;
+; %0 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* @local_mem, i32 0, i32 %index
+; %1 = load i32, i32 addrspace(3)* %0
+; %2 = load i32, i32 addrspace(1)* %in
+;
+; The instruction selection phase will generate ISA that looks like this:
+; %OQAP = LDS_READ_RET
+; %vreg0 = MOV %OQAP
+; %vreg1 = VTX_READ_32
+; %vreg2 = ADD_INT %vreg1, %vreg0
+;
+; The bottom scheduler will schedule the two ALU instructions first:
+;
+; UNSCHEDULED:
+; %OQAP = LDS_READ_RET
+; %vreg1 = VTX_READ_32
+;
+; SCHEDULED:
+;
+; %vreg0 = MOV %OQAP
+; %vreg2 = ADD_INT %vreg1, %vreg0
+;
+; The lack of proper aliasing causes the local memory read (LDS_READ_RET) to
+; be treated as having a chain dependency with the global memory read
+; (VTX_READ_32), so the global memory read will always be scheduled first.
+; This gives us a final program which looks like this:
+;
+; Alu clause:
+; %OQAP = LDS_READ_RET
+; VTX clause:
+; %vreg1 = VTX_READ_32
+; Alu clause:
+; %vreg0 = MOV %OQAP
+; %vreg2 = ADD_INT %vreg1, %vreg0
+;
+; This is an illegal program because the OQAP def and use now occur in
+; different ALU clauses.
+;
+; This test checks this scenario and makes sure it doesn't result in an
+; illegal program. For now, we have fixed this issue by merging the
+; LDS_READ_RET and MOV together during instruction selection and then
+; expanding them after scheduling. Once the scheduler has better alias
+; analysis, we should be able to keep these instructions separate before
+; scheduling.
+;
+; CHECK-LABEL: {{^}}local_global_alias:
+; CHECK: LDS_READ_RET
+; CHECK-NOT: ALU clause
+; CHECK: MOV * T{{[0-9]\.[XYZW]}}, OQAP
+define void @local_global_alias(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+entry:
+ %0 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* @local_mem, i32 0, i32 0
+ %1 = load i32, i32 addrspace(3)* %0
+ %2 = load i32, i32 addrspace(1)* %in
+ %3 = add i32 %2, %1
+ store i32 %3, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/lds-size.ll b/test/CodeGen/AMDGPU/lds-size.ll
new file mode 100644
index 000000000000..3e8328659fdb
--- /dev/null
+++ b/test/CodeGen/AMDGPU/lds-size.ll
@@ -0,0 +1,26 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+; This test makes sure we do not double count global values when they are
+; used in different basic blocks.
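+; The two .long values checked below are the config-word pair the R600
+; backend emits for the LDS allocation (register id followed by the size);
+; a size of 1 rather than 2 shows that @lds, although stored to from both
+; the 'if' and 'else' blocks, is only counted once.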
+
+; CHECK: .long 166120
+; CHECK-NEXT: .long 1
+; CHECK-LABEL: {{^}}test:
+@lds = internal unnamed_addr addrspace(3) global i32 undef, align 4
+
+define void @test(i32 addrspace(1)* %out, i32 %cond) {
+entry:
+ %0 = icmp eq i32 %cond, 0
+ br i1 %0, label %if, label %else
+
+if:
+ store i32 1, i32 addrspace(3)* @lds
+ br label %endif
+
+else:
+ store i32 2, i32 addrspace(3)* @lds
+ br label %endif
+
+endif:
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/lds-zero-initializer.ll b/test/CodeGen/AMDGPU/lds-zero-initializer.ll
new file mode 100644
index 000000000000..fb51bc0e50c2
--- /dev/null
+++ b/test/CodeGen/AMDGPU/lds-zero-initializer.ll
@@ -0,0 +1,13 @@
+; RUN: not llc -march=amdgcn -mcpu=SI < %s 2>&1 | FileCheck %s
+; RUN: not llc -march=amdgcn -mcpu=tonga < %s 2>&1 | FileCheck %s
+
+; CHECK: error: unsupported initializer for address space in load_zeroinit_lds_global
+
+@lds = addrspace(3) global [256 x i32] zeroinitializer
+
+define void @load_zeroinit_lds_global(i32 addrspace(1)* %out, i1 %p) {
+ %gep = getelementptr [256 x i32], [256 x i32] addrspace(3)* @lds, i32 0, i32 10
+ %ld = load i32, i32 addrspace(3)* %gep
+ store i32 %ld, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/legalizedag-bug-expand-setcc.ll b/test/CodeGen/AMDGPU/legalizedag-bug-expand-setcc.ll
new file mode 100644
index 000000000000..4244c48d240e
--- /dev/null
+++ b/test/CodeGen/AMDGPU/legalizedag-bug-expand-setcc.ll
@@ -0,0 +1,26 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+; This tests a bug where LegalizeDAG was not checking the target's
+; BooleanContents value and always using one for true, when expanding
+; setcc to select_cc.
+;
+; This bug caused the icmp IR instruction to be expanded to two machine
+; instructions, when only one is needed.
+;
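+; For illustration: with the target's all-ones boolean convention,
+; (setcc a, b, eq) should expand to (select_cc a, b, -1, 0, eq), which the
+; branch can use directly; expanding with a literal 1 as the true value
+; instead leaves a result that has to be fixed up with a second instruction
+; (the CND the CHECK-NOT below guards against).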
+
+; CHECK: {{^}}setcc_expand:
+; CHECK: SET
+; CHECK-NOT: CND
+define void @setcc_expand(i32 addrspace(1)* %out, i32 %in) {
+entry:
+ %0 = icmp eq i32 %in, 5
+ br i1 %0, label %IF, label %ENDIF
+IF:
+ %1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
+ store i32 0, i32 addrspace(1)* %1
+ br label %ENDIF
+
+ENDIF:
+ store i32 0, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/lit.local.cfg b/test/CodeGen/AMDGPU/lit.local.cfg
new file mode 100644
index 000000000000..2a665f06be72
--- /dev/null
+++ b/test/CodeGen/AMDGPU/lit.local.cfg
@@ -0,0 +1,2 @@
+if 'AMDGPU' not in config.root.targets:
+ config.unsupported = True
diff --git a/test/CodeGen/AMDGPU/literals.ll b/test/CodeGen/AMDGPU/literals.ll
new file mode 100644
index 000000000000..cff1c24f89d6
--- /dev/null
+++ b/test/CodeGen/AMDGPU/literals.ll
@@ -0,0 +1,64 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+; Test using an integer literal constant.
+; Generated ASM should be:
+; ADD_INT KC0[2].Z literal.x, 5
+; or
+; ADD_INT literal.x KC0[2].Z, 5
+
+; CHECK: {{^}}i32_literal:
+; CHECK: ADD_INT {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.x
+; CHECK-NEXT: LSHR
+; CHECK-NEXT: 5
+define void @i32_literal(i32 addrspace(1)* %out, i32 %in) {
+entry:
+ %0 = add i32 5, %in
+ store i32 %0, i32 addrspace(1)* %out
+ ret void
+}
+
+; Test using a float literal constant.
+; Generated ASM should be:
+; ADD KC0[2].Z literal.x, 5.0
+; or
+; ADD literal.x KC0[2].Z, 5.0
+
+; CHECK: {{^}}float_literal:
+; CHECK: ADD {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.x
+; CHECK-NEXT: LSHR
+; CHECK-NEXT: 1084227584(5.0
+define void @float_literal(float addrspace(1)* %out, float %in) {
+entry:
+ %0 = fadd float 5.0, %in
+ store float %0, float addrspace(1)* %out
+ ret void
+}
+
+; Make sure inline literals are folded into REG_SEQUENCE instructions.
+; CHECK: {{^}}inline_literal_reg_sequence:
+; CHECK: MOV {{\** *}}T[[GPR:[0-9]]].X, 0.0
+; CHECK-NEXT: MOV {{\** *}}T[[GPR]].Y, 0.0
+; CHECK-NEXT: MOV {{\** *}}T[[GPR]].Z, 0.0
+; CHECK-NEXT: MOV {{\** *}}T[[GPR]].W, 0.0
+
+define void @inline_literal_reg_sequence(<4 x i32> addrspace(1)* %out) {
+entry:
+ store <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+; CHECK: {{^}}inline_literal_dot4:
+; CHECK: DOT4 T[[GPR:[0-9]]].X, 1.0
+; CHECK-NEXT: DOT4 T[[GPR]].Y (MASKED), 1.0
+; CHECK-NEXT: DOT4 T[[GPR]].Z (MASKED), 1.0
+; CHECK-NEXT: DOT4 * T[[GPR]].W (MASKED), 1.0
+define void @inline_literal_dot4(float addrspace(1)* %out) {
+entry:
+ %0 = call float @llvm.AMDGPU.dp4(<4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
+ store float %0, float addrspace(1)* %out
+ ret void
+}
+
+declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1
+
+attributes #1 = { readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.abs.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.abs.ll
new file mode 100644
index 000000000000..8bf094b8bc7b
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.abs.ll
@@ -0,0 +1,49 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+declare i32 @llvm.AMDGPU.abs(i32) nounwind readnone
+
+; Legacy name
+declare i32 @llvm.AMDIL.abs.i32(i32) nounwind readnone
+
+; FUNC-LABEL: {{^}}s_abs_i32:
+; SI: s_sub_i32
+; SI: s_max_i32
+; SI: s_endpgm
+
+; EG: SUB_INT
+; EG: MAX_INT
+define void @s_abs_i32(i32 addrspace(1)* %out, i32 %src) nounwind {
+ %abs = call i32 @llvm.AMDGPU.abs(i32 %src) nounwind readnone
+ store i32 %abs, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_abs_i32:
+; SI: v_sub_i32_e32
+; SI: v_max_i32_e32
+; SI: s_endpgm
+
+; EG: SUB_INT
+; EG: MAX_INT
+define void @v_abs_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %src) nounwind {
+ %val = load i32, i32 addrspace(1)* %src, align 4
+ %abs = call i32 @llvm.AMDGPU.abs(i32 %val) nounwind readnone
+ store i32 %abs, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}abs_i32_legacy_amdil:
+; SI: v_sub_i32_e32
+; SI: v_max_i32_e32
+; SI: s_endpgm
+
+; EG: SUB_INT
+; EG: MAX_INT
+define void @abs_i32_legacy_amdil(i32 addrspace(1)* %out, i32 addrspace(1)* %src) nounwind {
+ %val = load i32, i32 addrspace(1)* %src, align 4
+ %abs = call i32 @llvm.AMDIL.abs.i32(i32 %val) nounwind readnone
+ store i32 %abs, i32 addrspace(1)* %out, align 4
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.barrier.global.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.barrier.global.ll
new file mode 100644
index 000000000000..db883972d646
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.barrier.global.ll
@@ -0,0 +1,30 @@
+; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}test_barrier_global:
+; EG: GROUP_BARRIER
+; SI: buffer_store_dword
+; SI: s_waitcnt
+; SI: s_barrier
+
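+; Each work-item stores its id to out[tid], waits at the barrier, and then
+; reads back the value written by the "mirror" work-item at index
+; local_size_x - 1 - tid, so the store must have completed before the
+; barrier (hence the s_waitcnt before the s_barrier checked above).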
+define void @test_barrier_global(i32 addrspace(1)* %out) {
+entry:
+ %0 = call i32 @llvm.r600.read.tidig.x()
+ %1 = getelementptr i32, i32 addrspace(1)* %out, i32 %0
+ store i32 %0, i32 addrspace(1)* %1
+ call void @llvm.AMDGPU.barrier.global()
+ %2 = call i32 @llvm.r600.read.local.size.x()
+ %3 = sub i32 %2, 1
+ %4 = sub i32 %3, %0
+ %5 = getelementptr i32, i32 addrspace(1)* %out, i32 %4
+ %6 = load i32, i32 addrspace(1)* %5
+ store i32 %6, i32 addrspace(1)* %1
+ ret void
+}
+
+declare void @llvm.AMDGPU.barrier.global()
+
+declare i32 @llvm.r600.read.tidig.x() #0
+declare i32 @llvm.r600.read.local.size.x() #0
+
+attributes #0 = { readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.barrier.local.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.barrier.local.ll
new file mode 100644
index 000000000000..48fb2e0b1a8d
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.barrier.local.ll
@@ -0,0 +1,31 @@
+; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}test_barrier_local:
+; EG: GROUP_BARRIER
+
+; SI: buffer_store_dword
+; SI: s_waitcnt
+; SI: s_barrier
+
+define void @test_barrier_local(i32 addrspace(1)* %out) {
+entry:
+ %0 = call i32 @llvm.r600.read.tidig.x()
+ %1 = getelementptr i32, i32 addrspace(1)* %out, i32 %0
+ store i32 %0, i32 addrspace(1)* %1
+ call void @llvm.AMDGPU.barrier.local()
+ %2 = call i32 @llvm.r600.read.local.size.x()
+ %3 = sub i32 %2, 1
+ %4 = sub i32 %3, %0
+ %5 = getelementptr i32, i32 addrspace(1)* %out, i32 %4
+ %6 = load i32, i32 addrspace(1)* %5
+ store i32 %6, i32 addrspace(1)* %1
+ ret void
+}
+
+declare void @llvm.AMDGPU.barrier.local()
+
+declare i32 @llvm.r600.read.tidig.x() #0
+declare i32 @llvm.r600.read.local.size.x() #0
+
+attributes #0 = { readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.i32.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.i32.ll
new file mode 100644
index 000000000000..1168713ca66e
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.i32.ll
@@ -0,0 +1,437 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood -show-mc-encoding -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+declare i32 @llvm.AMDGPU.bfe.i32(i32, i32, i32) nounwind readnone
+
+; FUNC-LABEL: {{^}}bfe_i32_arg_arg_arg:
+; SI: v_bfe_i32
+; EG: BFE_INT
+; EG: encoding: [{{[x0-9a-f]+,[x0-9a-f]+,[x0-9a-f]+,[x0-9a-f]+,[x0-9a-f]+}},0xac
+define void @bfe_i32_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind {
+ %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %src0, i32 %src1, i32 %src1) nounwind readnone
+ store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfe_i32_arg_arg_imm:
+; SI: v_bfe_i32
+; EG: BFE_INT
+define void @bfe_i32_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind {
+ %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %src0, i32 %src1, i32 123) nounwind readnone
+ store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfe_i32_arg_imm_arg:
+; SI: v_bfe_i32
+; EG: BFE_INT
+define void @bfe_i32_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) nounwind {
+ %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %src0, i32 123, i32 %src2) nounwind readnone
+ store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfe_i32_imm_arg_arg:
+; SI: v_bfe_i32
+; EG: BFE_INT
+define void @bfe_i32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) nounwind {
+ %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 123, i32 %src1, i32 %src2) nounwind readnone
+ store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_bfe_print_arg:
+; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 2, 8
+define void @v_bfe_print_arg(i32 addrspace(1)* %out, i32 addrspace(1)* %src0) nounwind {
+ %load = load i32, i32 addrspace(1)* %src0, align 4
+ %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 2, i32 8) nounwind readnone
+ store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfe_i32_arg_0_width_reg_offset:
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
+; EG-NOT: BFE
+define void @bfe_i32_arg_0_width_reg_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind {
+ %bfe_u32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %src0, i32 %src1, i32 0) nounwind readnone
+ store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfe_i32_arg_0_width_imm_offset:
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
+; EG-NOT: BFE
+define void @bfe_i32_arg_0_width_imm_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind {
+ %bfe_u32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %src0, i32 8, i32 0) nounwind readnone
+ store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfe_i32_test_6:
+; SI: v_lshlrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
+; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
+; SI: s_endpgm
+define void @bfe_i32_test_6(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %x = load i32, i32 addrspace(1)* %in, align 4
+ %shl = shl i32 %x, 31
+ %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shl, i32 1, i32 31)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfe_i32_test_7:
+; SI-NOT: shl
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
+define void @bfe_i32_test_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %x = load i32, i32 addrspace(1)* %in, align 4
+ %shl = shl i32 %x, 31
+ %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shl, i32 0, i32 31)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfe_i32_test_8:
+; SI: buffer_load_dword
+; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 1
+; SI: s_endpgm
+define void @bfe_i32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %x = load i32, i32 addrspace(1)* %in, align 4
+ %shl = shl i32 %x, 31
+ %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shl, i32 31, i32 1)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfe_i32_test_9:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
+define void @bfe_i32_test_9(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %x = load i32, i32 addrspace(1)* %in, align 4
+ %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 31, i32 1)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfe_i32_test_10:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
+define void @bfe_i32_test_10(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %x = load i32, i32 addrspace(1)* %in, align 4
+ %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 1, i32 31)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfe_i32_test_11:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
+define void @bfe_i32_test_11(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %x = load i32, i32 addrspace(1)* %in, align 4
+ %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 8, i32 24)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfe_i32_test_12:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 24, v{{[0-9]+}}
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
+define void @bfe_i32_test_12(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %x = load i32, i32 addrspace(1)* %in, align 4
+ %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 24, i32 8)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfe_i32_test_13:
+; SI: v_ashrrev_i32_e32 {{v[0-9]+}}, 31, {{v[0-9]+}}
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
+define void @bfe_i32_test_13(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %x = load i32, i32 addrspace(1)* %in, align 4
+ %shl = ashr i32 %x, 31
+ %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shl, i32 31, i32 1)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfe_i32_test_14:
+; SI-NOT: lshr
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
+define void @bfe_i32_test_14(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %x = load i32, i32 addrspace(1)* %in, align 4
+ %shl = lshr i32 %x, 31
+ %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shl, i32 31, i32 1)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_0:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
+; EG-NOT: BFE
+define void @bfe_i32_constant_fold_test_0(i32 addrspace(1)* %out) nounwind {
+ %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 0, i32 0, i32 0) nounwind readnone
+ store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_1:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
+; EG-NOT: BFE
+define void @bfe_i32_constant_fold_test_1(i32 addrspace(1)* %out) nounwind {
+ %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 12334, i32 0, i32 0) nounwind readnone
+ store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_2:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
+; EG-NOT: BFE
+define void @bfe_i32_constant_fold_test_2(i32 addrspace(1)* %out) nounwind {
+ %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 0, i32 0, i32 1) nounwind readnone
+ store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_3:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], -1
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
+; EG-NOT: BFE
+define void @bfe_i32_constant_fold_test_3(i32 addrspace(1)* %out) nounwind {
+ %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 1, i32 0, i32 1) nounwind readnone
+ store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_4:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], -1
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
+; EG-NOT: BFE
+define void @bfe_i32_constant_fold_test_4(i32 addrspace(1)* %out) nounwind {
+ %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 4294967295, i32 0, i32 1) nounwind readnone
+ store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_5:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], -1
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
+; EG-NOT: BFE
+define void @bfe_i32_constant_fold_test_5(i32 addrspace(1)* %out) nounwind {
+ %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 128, i32 7, i32 1) nounwind readnone
+ store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_6:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0xffffff80
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
+; EG-NOT: BFE
+define void @bfe_i32_constant_fold_test_6(i32 addrspace(1)* %out) nounwind {
+ %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 128, i32 0, i32 8) nounwind readnone
+ store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_7:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
+; EG-NOT: BFE
+define void @bfe_i32_constant_fold_test_7(i32 addrspace(1)* %out) nounwind {
+ %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 127, i32 0, i32 8) nounwind readnone
+ store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_8:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
+; EG-NOT: BFE
+define void @bfe_i32_constant_fold_test_8(i32 addrspace(1)* %out) nounwind {
+ %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 127, i32 6, i32 8) nounwind readnone
+ store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_9:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
+; EG-NOT: BFE
+define void @bfe_i32_constant_fold_test_9(i32 addrspace(1)* %out) nounwind {
+ %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 65536, i32 16, i32 8) nounwind readnone
+ store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_10:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
+; EG-NOT: BFE
+define void @bfe_i32_constant_fold_test_10(i32 addrspace(1)* %out) nounwind {
+ %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 65535, i32 16, i32 16) nounwind readnone
+ store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_11:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], -6
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
+; EG-NOT: BFE
+define void @bfe_i32_constant_fold_test_11(i32 addrspace(1)* %out) nounwind {
+ %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 160, i32 4, i32 4) nounwind readnone
+ store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_12:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
+; EG-NOT: BFE
+define void @bfe_i32_constant_fold_test_12(i32 addrspace(1)* %out) nounwind {
+ %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 160, i32 31, i32 1) nounwind readnone
+ store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_13:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
+; EG-NOT: BFE
+define void @bfe_i32_constant_fold_test_13(i32 addrspace(1)* %out) nounwind {
+ %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 131070, i32 16, i32 16) nounwind readnone
+ store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_14:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 40
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
+; EG-NOT: BFE
+define void @bfe_i32_constant_fold_test_14(i32 addrspace(1)* %out) nounwind {
+ %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 160, i32 2, i32 30) nounwind readnone
+ store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_15:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 10
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
+; EG-NOT: BFE
+define void @bfe_i32_constant_fold_test_15(i32 addrspace(1)* %out) nounwind {
+ %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 160, i32 4, i32 28) nounwind readnone
+ store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_16:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], -1
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
+; EG-NOT: BFE
+define void @bfe_i32_constant_fold_test_16(i32 addrspace(1)* %out) nounwind {
+ %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 4294967295, i32 1, i32 7) nounwind readnone
+ store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_17:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
+; EG-NOT: BFE
+define void @bfe_i32_constant_fold_test_17(i32 addrspace(1)* %out) nounwind {
+ %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 255, i32 1, i32 31) nounwind readnone
+ store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_18:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
+; EG-NOT: BFE
+define void @bfe_i32_constant_fold_test_18(i32 addrspace(1)* %out) nounwind {
+ %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 255, i32 31, i32 1) nounwind readnone
+ store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfe_sext_in_reg_i24:
+; SI: buffer_load_dword [[LOAD:v[0-9]+]],
+; SI-NOT: v_lshl
+; SI-NOT: v_ashr
+; SI: v_bfe_i32 [[BFE:v[0-9]+]], [[LOAD]], 0, 24
+; SI: buffer_store_dword [[BFE]],
+define void @bfe_sext_in_reg_i24(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %x = load i32, i32 addrspace(1)* %in, align 4
+ %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 0, i32 24)
+ %shl = shl i32 %bfe, 8
+ %ashr = ashr i32 %shl, 8
+ store i32 %ashr, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @simplify_demanded_bfe_sdiv
+; SI: buffer_load_dword [[LOAD:v[0-9]+]]
+; SI: v_bfe_i32 [[BFE:v[0-9]+]], [[LOAD]], 1, 16
+; SI: v_lshrrev_b32_e32 [[TMP0:v[0-9]+]], 31, [[BFE]]
+; SI: v_add_i32_e32 [[TMP1:v[0-9]+]], [[TMP0]], [[BFE]]
+; SI: v_ashrrev_i32_e32 [[TMP2:v[0-9]+]], 1, [[TMP1]]
+; SI: buffer_store_dword [[TMP2]]
+define void @simplify_demanded_bfe_sdiv(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %src = load i32, i32 addrspace(1)* %in, align 4
+ %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %src, i32 1, i32 16) nounwind readnone
+ %div = sdiv i32 %bfe, 2
+ store i32 %div, i32 addrspace(1)* %out, align 4
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.u32.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.u32.ll
new file mode 100644
index 000000000000..541119242a94
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.u32.ll
@@ -0,0 +1,627 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+declare i32 @llvm.AMDGPU.bfe.u32(i32, i32, i32) nounwind readnone
+
+; FUNC-LABEL: {{^}}bfe_u32_arg_arg_arg:
+; SI: v_bfe_u32
+; EG: BFE_UINT
+define void @bfe_u32_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind {
+ %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 %src1, i32 %src2) nounwind readnone
+ store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfe_u32_arg_arg_imm:
+; SI: v_bfe_u32
+; EG: BFE_UINT
+define void @bfe_u32_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind {
+ %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 %src1, i32 123) nounwind readnone
+ store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfe_u32_arg_imm_arg:
+; SI: v_bfe_u32
+; EG: BFE_UINT
+define void @bfe_u32_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) nounwind {
+ %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 123, i32 %src2) nounwind readnone
+ store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfe_u32_imm_arg_arg:
+; SI: v_bfe_u32
+; EG: BFE_UINT
+define void @bfe_u32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) nounwind {
+ %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 123, i32 %src1, i32 %src2) nounwind readnone
+ store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfe_u32_arg_0_width_reg_offset:
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
+; EG-NOT: BFE
+define void @bfe_u32_arg_0_width_reg_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind {
+ %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 %src1, i32 0) nounwind readnone
+ store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfe_u32_arg_0_width_imm_offset:
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
+; EG-NOT: BFE
+define void @bfe_u32_arg_0_width_imm_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind {
+ %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 8, i32 0) nounwind readnone
+ store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfe_u32_zextload_i8:
+; SI: buffer_load_ubyte
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
+define void @bfe_u32_zextload_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind {
+ %load = load i8, i8 addrspace(1)* %in
+ %ext = zext i8 %load to i32
+ %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 0, i32 8)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfe_u32_zext_in_reg_i8:
+; SI: buffer_load_dword
+; SI: v_add_i32
+; SI-NEXT: v_and_b32_e32
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
+define void @bfe_u32_zext_in_reg_i8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %load = load i32, i32 addrspace(1)* %in, align 4
+ %add = add i32 %load, 1
+ %ext = and i32 %add, 255
+ %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 0, i32 8)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfe_u32_zext_in_reg_i16:
+; SI: buffer_load_dword
+; SI: v_add_i32
+; SI-NEXT: v_and_b32_e32
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
+define void @bfe_u32_zext_in_reg_i16(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %load = load i32, i32 addrspace(1)* %in, align 4
+ %add = add i32 %load, 1
+ %ext = and i32 %add, 65535
+ %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 0, i32 16)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfe_u32_zext_in_reg_i8_offset_1:
+; SI: buffer_load_dword
+; SI: v_add_i32
+; SI: bfe
+; SI: s_endpgm
+define void @bfe_u32_zext_in_reg_i8_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %load = load i32, i32 addrspace(1)* %in, align 4
+ %add = add i32 %load, 1
+ %ext = and i32 %add, 255
+ %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 1, i32 8)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
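+; Here the bfe reads only bits 3-7 of the byte, so the 0xff mask is expected to
+; shrink to 0xf8.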
+; FUNC-LABEL: {{^}}bfe_u32_zext_in_reg_i8_offset_3:
+; SI: buffer_load_dword
+; SI: v_add_i32
+; SI-NEXT: v_and_b32_e32 {{v[0-9]+}}, 0xf8
+; SI-NEXT: bfe
+; SI: s_endpgm
+define void @bfe_u32_zext_in_reg_i8_offset_3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %load = load i32, i32 addrspace(1)* %in, align 4
+ %add = add i32 %load, 1
+ %ext = and i32 %add, 255
+ %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 3, i32 8)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfe_u32_zext_in_reg_i8_offset_7:
+; SI: buffer_load_dword
+; SI: v_add_i32
+; SI-NEXT: v_and_b32_e32 {{v[0-9]+}}, 0x80
+; SI-NEXT: bfe
+; SI: s_endpgm
+define void @bfe_u32_zext_in_reg_i8_offset_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %load = load i32, i32 addrspace(1)* %in, align 4
+ %add = add i32 %load, 1
+ %ext = and i32 %add, 255
+ %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 7, i32 8)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfe_u32_zext_in_reg_i16_offset_8:
+; SI: buffer_load_dword
+; SI: v_add_i32
+; SI-NEXT: bfe
+; SI: s_endpgm
+define void @bfe_u32_zext_in_reg_i16_offset_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %load = load i32, i32 addrspace(1)* %in, align 4
+ %add = add i32 %load, 1
+ %ext = and i32 %add, 65535
+ %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 8, i32 8)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfe_u32_test_1:
+; SI: buffer_load_dword
+; SI: v_and_b32_e32 {{v[0-9]+}}, 1, {{v[0-9]+}}
+; SI: s_endpgm
+; EG: AND_INT T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}, 1,
+define void @bfe_u32_test_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %x = load i32, i32 addrspace(1)* %in, align 4
+ %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 0, i32 1)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+define void @bfe_u32_test_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %x = load i32, i32 addrspace(1)* %in, align 4
+ %shl = shl i32 %x, 31
+ %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 0, i32 8)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+define void @bfe_u32_test_3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %x = load i32, i32 addrspace(1)* %in, align 4
+ %shl = shl i32 %x, 31
+ %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 0, i32 1)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfe_u32_test_4:
+; SI-NOT: lshl
+; SI-NOT: shr
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
+define void @bfe_u32_test_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %x = load i32, i32 addrspace(1)* %in, align 4
+ %shl = shl i32 %x, 31
+ %shr = lshr i32 %shl, 31
+ %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shr, i32 31, i32 1)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfe_u32_test_5:
+; SI: buffer_load_dword
+; SI-NOT: lshl
+; SI-NOT: shr
+; SI: v_bfe_i32 {{v[0-9]+}}, {{v[0-9]+}}, 0, 1
+; SI: s_endpgm
+define void @bfe_u32_test_5(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %x = load i32, i32 addrspace(1)* %in, align 4
+ %shl = shl i32 %x, 31
+ %shr = ashr i32 %shl, 31
+ %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shr, i32 0, i32 1)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfe_u32_test_6:
+; SI: v_lshlrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
+; SI: v_lshrrev_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
+; SI: s_endpgm
+define void @bfe_u32_test_6(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %x = load i32, i32 addrspace(1)* %in, align 4
+ %shl = shl i32 %x, 31
+ %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 1, i32 31)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfe_u32_test_7:
+; SI: v_lshlrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
+define void @bfe_u32_test_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %x = load i32, i32 addrspace(1)* %in, align 4
+ %shl = shl i32 %x, 31
+ %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 0, i32 31)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfe_u32_test_8:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_and_b32_e32 {{v[0-9]+}}, 1, {{v[0-9]+}}
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
+define void @bfe_u32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %x = load i32, i32 addrspace(1)* %in, align 4
+ %shl = shl i32 %x, 31
+ %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 31, i32 1)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfe_u32_test_9:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_lshrrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
+define void @bfe_u32_test_9(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %x = load i32, i32 addrspace(1)* %in, align 4
+ %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 31, i32 1)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfe_u32_test_10:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_lshrrev_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
+define void @bfe_u32_test_10(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %x = load i32, i32 addrspace(1)* %in, align 4
+ %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 1, i32 31)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfe_u32_test_11:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_lshrrev_b32_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
+define void @bfe_u32_test_11(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %x = load i32, i32 addrspace(1)* %in, align 4
+ %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 8, i32 24)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfe_u32_test_12:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_lshrrev_b32_e32 v{{[0-9]+}}, 24, v{{[0-9]+}}
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
+define void @bfe_u32_test_12(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %x = load i32, i32 addrspace(1)* %in, align 4
+ %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 24, i32 8)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfe_u32_test_13:
+; V_ASHRREV_U32_e32 {{v[0-9]+}}, 31, {{v[0-9]+}}
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
+define void @bfe_u32_test_13(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %x = load i32, i32 addrspace(1)* %in, align 4
+ %shl = ashr i32 %x, 31
+ %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 31, i32 1)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfe_u32_test_14:
+; SI-NOT: lshr
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
+define void @bfe_u32_test_14(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %x = load i32, i32 addrspace(1)* %in, align 4
+ %shl = lshr i32 %x, 31
+ %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 31, i32 1)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_0:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
+; EG-NOT: BFE
+define void @bfe_u32_constant_fold_test_0(i32 addrspace(1)* %out) nounwind {
+ %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 0, i32 0, i32 0) nounwind readnone
+ store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_1:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
+; EG-NOT: BFE
+define void @bfe_u32_constant_fold_test_1(i32 addrspace(1)* %out) nounwind {
+ %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 12334, i32 0, i32 0) nounwind readnone
+ store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_2:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
+; EG-NOT: BFE
+define void @bfe_u32_constant_fold_test_2(i32 addrspace(1)* %out) nounwind {
+ %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 0, i32 0, i32 1) nounwind readnone
+ store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_3:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
+; EG-NOT: BFE
+define void @bfe_u32_constant_fold_test_3(i32 addrspace(1)* %out) nounwind {
+ %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 1, i32 0, i32 1) nounwind readnone
+ store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_4:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], -1
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
+; EG-NOT: BFE
+define void @bfe_u32_constant_fold_test_4(i32 addrspace(1)* %out) nounwind {
+ %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 4294967295, i32 0, i32 1) nounwind readnone
+ store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_5:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
+; EG-NOT: BFE
+define void @bfe_u32_constant_fold_test_5(i32 addrspace(1)* %out) nounwind {
+ %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 128, i32 7, i32 1) nounwind readnone
+ store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_6:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x80
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
+; EG-NOT: BFE
+define void @bfe_u32_constant_fold_test_6(i32 addrspace(1)* %out) nounwind {
+ %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 128, i32 0, i32 8) nounwind readnone
+ store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_7:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
+; EG-NOT: BFE
+define void @bfe_u32_constant_fold_test_7(i32 addrspace(1)* %out) nounwind {
+ %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 127, i32 0, i32 8) nounwind readnone
+ store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_8:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
+; EG-NOT: BFE
+define void @bfe_u32_constant_fold_test_8(i32 addrspace(1)* %out) nounwind {
+ %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 127, i32 6, i32 8) nounwind readnone
+ store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_9:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
+; EG-NOT: BFE
+define void @bfe_u32_constant_fold_test_9(i32 addrspace(1)* %out) nounwind {
+ %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 65536, i32 16, i32 8) nounwind readnone
+ store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_10:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
+; EG-NOT: BFE
+define void @bfe_u32_constant_fold_test_10(i32 addrspace(1)* %out) nounwind {
+ %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 65535, i32 16, i32 16) nounwind readnone
+ store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
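+; 160 is 0b1010_0000, so a 4-bit field starting at bit 4 extracts 0b1010 = 10.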
+; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_11:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 10
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
+; EG-NOT: BFE
+define void @bfe_u32_constant_fold_test_11(i32 addrspace(1)* %out) nounwind {
+ %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 160, i32 4, i32 4) nounwind readnone
+ store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_12:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
+; EG-NOT: BFE
+define void @bfe_u32_constant_fold_test_12(i32 addrspace(1)* %out) nounwind {
+ %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 160, i32 31, i32 1) nounwind readnone
+ store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_13:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
+; EG-NOT: BFE
+define void @bfe_u32_constant_fold_test_13(i32 addrspace(1)* %out) nounwind {
+ %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 131070, i32 16, i32 16) nounwind readnone
+ store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_14:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 40
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
+; EG-NOT: BFE
+define void @bfe_u32_constant_fold_test_14(i32 addrspace(1)* %out) nounwind {
+ %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 160, i32 2, i32 30) nounwind readnone
+ store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_15:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 10
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
+; EG-NOT: BFE
+define void @bfe_u32_constant_fold_test_15(i32 addrspace(1)* %out) nounwind {
+ %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 160, i32 4, i32 28) nounwind readnone
+ store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_16:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
+; EG-NOT: BFE
+define void @bfe_u32_constant_fold_test_16(i32 addrspace(1)* %out) nounwind {
+ %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 4294967295, i32 1, i32 7) nounwind readnone
+ store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_17:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
+; EG-NOT: BFE
+define void @bfe_u32_constant_fold_test_17(i32 addrspace(1)* %out) nounwind {
+ %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 255, i32 1, i32 31) nounwind readnone
+ store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_18:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
+; EG-NOT: BFE
+define void @bfe_u32_constant_fold_test_18(i32 addrspace(1)* %out) nounwind {
+ %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 255, i32 31, i32 1) nounwind readnone
+ store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; Make sure that SimplifyDemandedBits doesn't cause the and to be
+; reduced to the bits demanded by the bfe.
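+; The bfe only demands bits 2-3 of %and, but %and is also stored to %out1, so
+; narrowing the 63 mask to 12 would change that second store.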
+
+; XXX: The operand to v_bfe_u32 could also just directly be the load register.
+; FUNC-LABEL: {{^}}simplify_bfe_u32_multi_use_arg:
+; SI: buffer_load_dword [[ARG:v[0-9]+]]
+; SI: v_and_b32_e32 [[AND:v[0-9]+]], 63, [[ARG]]
+; SI: v_bfe_u32 [[BFE:v[0-9]+]], [[AND]], 2, 2
+; SI-DAG: buffer_store_dword [[AND]]
+; SI-DAG: buffer_store_dword [[BFE]]
+; SI: s_endpgm
+define void @simplify_bfe_u32_multi_use_arg(i32 addrspace(1)* %out0,
+ i32 addrspace(1)* %out1,
+ i32 addrspace(1)* %in) nounwind {
+ %src = load i32, i32 addrspace(1)* %in, align 4
+ %and = and i32 %src, 63
+ %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %and, i32 2, i32 2) nounwind readnone
+ store i32 %bfe_u32, i32 addrspace(1)* %out0, align 4
+ store i32 %and, i32 addrspace(1)* %out1, align 4
+ ret void
+}
+
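+; The patterns below should all select s_bfe_u32; its second operand encodes the
+; field as (width << 16) | offset, so 0x30006 is a 3-bit field at offset 6 and
+; 0x150002 a 21-bit field at offset 2.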
+; FUNC-LABEL: {{^}}lshr_and:
+; SI: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x30006
+; SI: buffer_store_dword
+define void @lshr_and(i32 addrspace(1)* %out, i32 %a) nounwind {
+ %b = lshr i32 %a, 6
+ %c = and i32 %b, 7
+ store i32 %c, i32 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_lshr_and:
+; SI: v_bfe_u32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}, 3
+; SI: buffer_store_dword
+define void @v_lshr_and(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+ %c = lshr i32 %a, %b
+ %d = and i32 %c, 7
+ store i32 %d, i32 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}and_lshr:
+; SI: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x30006
+; SI: buffer_store_dword
+define void @and_lshr(i32 addrspace(1)* %out, i32 %a) nounwind {
+ %b = and i32 %a, 448
+ %c = lshr i32 %b, 6
+ store i32 %c, i32 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}and_lshr2:
+; SI: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x30006
+; SI: buffer_store_dword
+define void @and_lshr2(i32 addrspace(1)* %out, i32 %a) nounwind {
+ %b = and i32 %a, 511
+ %c = lshr i32 %b, 6
+ store i32 %c, i32 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}shl_lshr:
+; SI: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x150002
+; SI: buffer_store_dword
+define void @shl_lshr(i32 addrspace(1)* %out, i32 %a) nounwind {
+ %b = shl i32 %a, 9
+ %c = lshr i32 %b, 11
+ store i32 %c, i32 addrspace(1)* %out, align 8
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.bfi.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.bfi.ll
new file mode 100644
index 000000000000..517a55abc098
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.bfi.ll
@@ -0,0 +1,42 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+declare i32 @llvm.AMDGPU.bfi(i32, i32, i32) nounwind readnone
+
+; FUNC-LABEL: {{^}}bfi_arg_arg_arg:
+; SI: v_bfi_b32
+; EG: BFI_INT
+define void @bfi_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind {
+ %bfi = call i32 @llvm.AMDGPU.bfi(i32 %src0, i32 %src1, i32 %src2) nounwind readnone
+ store i32 %bfi, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfi_arg_arg_imm:
+; SI: v_bfi_b32
+; EG: BFI_INT
+define void @bfi_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind {
+ %bfi = call i32 @llvm.AMDGPU.bfi(i32 %src0, i32 %src1, i32 123) nounwind readnone
+ store i32 %bfi, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfi_arg_imm_arg:
+; SI: v_bfi_b32
+; EG: BFI_INT
+define void @bfi_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) nounwind {
+ %bfi = call i32 @llvm.AMDGPU.bfi(i32 %src0, i32 123, i32 %src2) nounwind readnone
+ store i32 %bfi, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfi_imm_arg_arg:
+; SI: v_bfi_b32
+; EG: BFI_INT
+define void @bfi_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) nounwind {
+ %bfi = call i32 @llvm.AMDGPU.bfi(i32 123, i32 %src1, i32 %src2) nounwind readnone
+ store i32 %bfi, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.bfm.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.bfm.ll
new file mode 100644
index 000000000000..50492289d744
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.bfm.ll
@@ -0,0 +1,60 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+declare i32 @llvm.AMDGPU.bfm(i32, i32) nounwind readnone
+
+; FUNC-LABEL: {{^}}bfm_arg_arg:
+; SI: s_bfm_b32 {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+; EG: BFM_INT
+define void @bfm_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind {
+ %bfm = call i32 @llvm.AMDGPU.bfm(i32 %src0, i32 %src1) nounwind readnone
+ store i32 %bfm, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfm_arg_imm:
+; SI: s_bfm_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x7b
+; EG: BFM_INT
+define void @bfm_arg_imm(i32 addrspace(1)* %out, i32 %src0) nounwind {
+ %bfm = call i32 @llvm.AMDGPU.bfm(i32 %src0, i32 123) nounwind readnone
+ store i32 %bfm, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfm_imm_arg:
+; SI: s_bfm_b32 {{s[0-9]+}}, 0x7b, {{s[0-9]+}}
+; EG: BFM_INT
+define void @bfm_imm_arg(i32 addrspace(1)* %out, i32 %src1) nounwind {
+ %bfm = call i32 @llvm.AMDGPU.bfm(i32 123, i32 %src1) nounwind readnone
+ store i32 %bfm, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfm_imm_imm:
+; SI: s_bfm_b32 {{s[0-9]+}}, 0x7b, 0x1c8
+; EG: BFM_INT
+define void @bfm_imm_imm(i32 addrspace(1)* %out) nounwind {
+ %bfm = call i32 @llvm.AMDGPU.bfm(i32 123, i32 456) nounwind readnone
+ store i32 %bfm, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
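+; s_bfm_b32 computes ((1 << src0) - 1) << src1, so the shl/sub/shl sequence
+; below should collapse to a single mask instruction.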
+; FUNC-LABEL: {{^}}bfm_pattern:
+; SI: s_bfm_b32 {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+define void @bfm_pattern(i32 addrspace(1)* %out, i32 %x, i32 %y) {
+ %a = shl i32 1, %x
+ %b = sub i32 %a, 1
+ %c = shl i32 %b, %y
+ store i32 %c, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfm_pattern_simple:
+; SI: s_bfm_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0
+define void @bfm_pattern_simple(i32 addrspace(1)* %out, i32 %x) {
+ %a = shl i32 1, %x
+ %b = sub i32 %a, 1
+ store i32 %b, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.brev.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.brev.ll
new file mode 100644
index 000000000000..301de4b1c82d
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.brev.ll
@@ -0,0 +1,28 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+declare i32 @llvm.AMDGPU.brev(i32) nounwind readnone
+
+; FUNC-LABEL: {{^}}s_brev_i32:
+; SI: s_load_dword [[VAL:s[0-9]+]],
+; SI: s_brev_b32 [[SRESULT:s[0-9]+]], [[VAL]]
+; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
+; SI: buffer_store_dword [[VRESULT]],
+; SI: s_endpgm
+define void @s_brev_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
+ %ctlz = call i32 @llvm.AMDGPU.brev(i32 %val) nounwind readnone
+ store i32 %ctlz, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_brev_i32:
+; SI: buffer_load_dword [[VAL:v[0-9]+]],
+; SI: v_bfrev_b32_e32 [[RESULT:v[0-9]+]], [[VAL]]
+; SI: buffer_store_dword [[RESULT]],
+; SI: s_endpgm
+define void @v_brev_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+ %val = load i32, i32 addrspace(1)* %valptr, align 4
+ %ctlz = call i32 @llvm.AMDGPU.brev(i32 %val) nounwind readnone
+ store i32 %ctlz, i32 addrspace(1)* %out, align 4
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.clamp.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.clamp.ll
new file mode 100644
index 000000000000..11ec963ab314
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.clamp.ll
@@ -0,0 +1,67 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+declare float @llvm.fabs.f32(float) nounwind readnone
+declare float @llvm.AMDGPU.clamp.f32(float, float, float) nounwind readnone
+declare float @llvm.AMDIL.clamp.f32(float, float, float) nounwind readnone
+
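+; A clamp to [0.0, 1.0] is expected to fold into the clamp output modifier of a
+; v_add_f32 with 0 rather than being emitted as a separate min/max pair.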
+; FUNC-LABEL: {{^}}clamp_0_1_f32:
+; SI: s_load_dword [[ARG:s[0-9]+]],
+; SI: v_add_f32_e64 [[RESULT:v[0-9]+]], 0, [[ARG]] clamp{{$}}
+; SI: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+
+; EG: MOV_SAT
+define void @clamp_0_1_f32(float addrspace(1)* %out, float %src) nounwind {
+ %clamp = call float @llvm.AMDGPU.clamp.f32(float %src, float 0.0, float 1.0) nounwind readnone
+ store float %clamp, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}clamp_fabs_0_1_f32:
+; SI: s_load_dword [[ARG:s[0-9]+]],
+; SI: v_add_f32_e64 [[RESULT:v[0-9]+]], 0, |[[ARG]]| clamp{{$}}
+; SI: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @clamp_fabs_0_1_f32(float addrspace(1)* %out, float %src) nounwind {
+ %src.fabs = call float @llvm.fabs.f32(float %src) nounwind readnone
+ %clamp = call float @llvm.AMDGPU.clamp.f32(float %src.fabs, float 0.0, float 1.0) nounwind readnone
+ store float %clamp, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}clamp_fneg_0_1_f32:
+; SI: s_load_dword [[ARG:s[0-9]+]],
+; SI: v_add_f32_e64 [[RESULT:v[0-9]+]], 0, -[[ARG]] clamp{{$}}
+; SI: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @clamp_fneg_0_1_f32(float addrspace(1)* %out, float %src) nounwind {
+ %src.fneg = fsub float -0.0, %src
+ %clamp = call float @llvm.AMDGPU.clamp.f32(float %src.fneg, float 0.0, float 1.0) nounwind readnone
+ store float %clamp, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}clamp_fneg_fabs_0_1_f32:
+; SI: s_load_dword [[ARG:s[0-9]+]],
+; SI: v_add_f32_e64 [[RESULT:v[0-9]+]], 0, -|[[ARG]]| clamp{{$}}
+; SI: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @clamp_fneg_fabs_0_1_f32(float addrspace(1)* %out, float %src) nounwind {
+ %src.fabs = call float @llvm.fabs.f32(float %src) nounwind readnone
+ %src.fneg.fabs = fsub float -0.0, %src.fabs
+ %clamp = call float @llvm.AMDGPU.clamp.f32(float %src.fneg.fabs, float 0.0, float 1.0) nounwind readnone
+ store float %clamp, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}clamp_0_1_amdil_legacy_f32:
+; SI: s_load_dword [[ARG:s[0-9]+]],
+; SI: v_add_f32_e64 [[RESULT:v[0-9]+]], 0, [[ARG]] clamp{{$}}
+; SI: buffer_store_dword [[RESULT]]
+define void @clamp_0_1_amdil_legacy_f32(float addrspace(1)* %out, float %src) nounwind {
+ %clamp = call float @llvm.AMDIL.clamp.f32(float %src, float 0.0, float 1.0) nounwind readnone
+ store float %clamp, float addrspace(1)* %out, align 4
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.class.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.class.ll
new file mode 100644
index 000000000000..805a88b59c72
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.class.ll
@@ -0,0 +1,497 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+declare i1 @llvm.AMDGPU.class.f32(float, i32) #1
+declare i1 @llvm.AMDGPU.class.f64(double, i32) #1
+declare i32 @llvm.r600.read.tidig.x() #1
+declare float @llvm.fabs.f32(float) #1
+declare double @llvm.fabs.f64(double) #1
+
+; SI-LABEL: {{^}}test_class_f32:
+; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
+; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
+; SI: v_cmp_class_f32_e32 vcc, [[SA]], [[VB]]
+; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
+; SI-NEXT: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @test_class_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 {
+ %result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 %b) #1
+ %sext = sext i1 %result to i32
+ store i32 %sext, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}test_class_fabs_f32:
+; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
+; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
+; SI: v_cmp_class_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], |[[SA]]|, [[VB]]
+; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
+; SI-NEXT: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @test_class_fabs_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 {
+ %a.fabs = call float @llvm.fabs.f32(float %a) #1
+ %result = call i1 @llvm.AMDGPU.class.f32(float %a.fabs, i32 %b) #1
+ %sext = sext i1 %result to i32
+ store i32 %sext, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}test_class_fneg_f32:
+; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
+; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
+; SI: v_cmp_class_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -[[SA]], [[VB]]
+; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
+; SI-NEXT: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @test_class_fneg_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 {
+ %a.fneg = fsub float -0.0, %a
+ %result = call i1 @llvm.AMDGPU.class.f32(float %a.fneg, i32 %b) #1
+ %sext = sext i1 %result to i32
+ store i32 %sext, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}test_class_fneg_fabs_f32:
+; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
+; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
+; SI: v_cmp_class_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -|[[SA]]|, [[VB]]
+; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
+; SI-NEXT: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @test_class_fneg_fabs_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 {
+ %a.fabs = call float @llvm.fabs.f32(float %a) #1
+ %a.fneg.fabs = fsub float -0.0, %a.fabs
+ %result = call i1 @llvm.AMDGPU.class.f32(float %a.fneg.fabs, i32 %b) #1
+ %sext = sext i1 %result to i32
+ store i32 %sext, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}test_class_1_f32:
+; SI: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI: v_cmp_class_f32_e64 [[COND:s\[[0-9]+:[0-9]+\]]], [[SA]], 1{{$}}
+; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[COND]]
+; SI-NEXT: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @test_class_1_f32(i32 addrspace(1)* %out, float %a) #0 {
+ %result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 1) #1
+ %sext = sext i1 %result to i32
+ store i32 %sext, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}test_class_64_f32:
+; SI: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI: v_cmp_class_f32_e64 [[COND:s\[[0-9]+:[0-9]+\]]], [[SA]], 64{{$}}
+; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[COND]]
+; SI-NEXT: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @test_class_64_f32(i32 addrspace(1)* %out, float %a) #0 {
+ %result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 64) #1
+ %sext = sext i1 %result to i32
+ store i32 %sext, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; Set all 10 bits of mask
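+; (bit order: sNaN, qNaN, -inf, -normal, -denormal, -0, +0, +denormal, +normal,
+; +inf; 0x3ff covers every class)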
+; SI-LABEL: {{^}}test_class_full_mask_f32:
+; SI: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x3ff{{$}}
+; SI: v_cmp_class_f32_e32 vcc, [[SA]], [[MASK]]
+; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
+; SI-NEXT: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @test_class_full_mask_f32(i32 addrspace(1)* %out, float %a) #0 {
+ %result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 1023) #1
+ %sext = sext i1 %result to i32
+ store i32 %sext, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}test_class_9bit_mask_f32:
+; SI: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1ff{{$}}
+; SI: v_cmp_class_f32_e32 vcc, [[SA]], [[MASK]]
+; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
+; SI-NEXT: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @test_class_9bit_mask_f32(i32 addrspace(1)* %out, float %a) #0 {
+ %result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 511) #1
+ %sext = sext i1 %result to i32
+ store i32 %sext, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}v_test_class_full_mask_f32:
+; SI-DAG: buffer_load_dword [[VA:v[0-9]+]]
+; SI-DAG: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1ff{{$}}
+; SI: v_cmp_class_f32_e32 vcc, [[VA]], [[MASK]]
+; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
+; SI: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @v_test_class_full_mask_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %a = load float, float addrspace(1)* %gep.in
+
+ %result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 511) #1
+ %sext = sext i1 %result to i32
+ store i32 %sext, i32 addrspace(1)* %gep.out, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}test_class_inline_imm_constant_dynamic_mask_f32:
+; SI-DAG: buffer_load_dword [[VB:v[0-9]+]]
+; SI: v_cmp_class_f32_e32 vcc, 1.0, [[VB]]
+; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
+; SI: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @test_class_inline_imm_constant_dynamic_mask_f32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %b = load i32, i32 addrspace(1)* %gep.in
+
+ %result = call i1 @llvm.AMDGPU.class.f32(float 1.0, i32 %b) #1
+ %sext = sext i1 %result to i32
+ store i32 %sext, i32 addrspace(1)* %gep.out, align 4
+ ret void
+}
+
+; FIXME: Why isn't this using a literal constant operand?
+; SI-LABEL: {{^}}test_class_lit_constant_dynamic_mask_f32:
+; SI-DAG: buffer_load_dword [[VB:v[0-9]+]]
+; SI-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x44800000
+; SI: v_cmp_class_f32_e32 vcc, [[VK]], [[VB]]
+; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
+; SI: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @test_class_lit_constant_dynamic_mask_f32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %b = load i32, i32 addrspace(1)* %gep.in
+
+ %result = call i1 @llvm.AMDGPU.class.f32(float 1024.0, i32 %b) #1
+ %sext = sext i1 %result to i32
+ store i32 %sext, i32 addrspace(1)* %gep.out, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}test_class_f64:
+; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
+; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
+; SI: v_cmp_class_f64_e32 vcc, [[SA]], [[VB]]
+; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
+; SI-NEXT: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @test_class_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 {
+ %result = call i1 @llvm.AMDGPU.class.f64(double %a, i32 %b) #1
+ %sext = sext i1 %result to i32
+ store i32 %sext, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}test_class_fabs_f64:
+; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
+; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
+; SI: v_cmp_class_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], |[[SA]]|, [[VB]]
+; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
+; SI-NEXT: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @test_class_fabs_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 {
+ %a.fabs = call double @llvm.fabs.f64(double %a) #1
+ %result = call i1 @llvm.AMDGPU.class.f64(double %a.fabs, i32 %b) #1
+ %sext = sext i1 %result to i32
+ store i32 %sext, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}test_class_fneg_f64:
+; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
+; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
+; SI: v_cmp_class_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -[[SA]], [[VB]]
+; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
+; SI-NEXT: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @test_class_fneg_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 {
+ %a.fneg = fsub double -0.0, %a
+ %result = call i1 @llvm.AMDGPU.class.f64(double %a.fneg, i32 %b) #1
+ %sext = sext i1 %result to i32
+ store i32 %sext, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}test_class_fneg_fabs_f64:
+; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
+; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
+; SI: v_cmp_class_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -|[[SA]]|, [[VB]]
+; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
+; SI-NEXT: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @test_class_fneg_fabs_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 {
+ %a.fabs = call double @llvm.fabs.f64(double %a) #1
+ %a.fneg.fabs = fsub double -0.0, %a.fabs
+ %result = call i1 @llvm.AMDGPU.class.f64(double %a.fneg.fabs, i32 %b) #1
+ %sext = sext i1 %result to i32
+ store i32 %sext, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}test_class_1_f64:
+; SI: v_cmp_class_f64_e64 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 1{{$}}
+; SI: s_endpgm
+define void @test_class_1_f64(i32 addrspace(1)* %out, double %a) #0 {
+ %result = call i1 @llvm.AMDGPU.class.f64(double %a, i32 1) #1
+ %sext = sext i1 %result to i32
+ store i32 %sext, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}test_class_64_f64:
+; SI: v_cmp_class_f64_e64 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 64{{$}}
+; SI: s_endpgm
+define void @test_class_64_f64(i32 addrspace(1)* %out, double %a) #0 {
+ %result = call i1 @llvm.AMDGPU.class.f64(double %a, i32 64) #1
+ %sext = sext i1 %result to i32
+ store i32 %sext, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; Set all 9 bits of mask
+; SI-LABEL: {{^}}test_class_full_mask_f64:
+; SI: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1ff{{$}}
+; SI: v_cmp_class_f64_e32 vcc, [[SA]], [[MASK]]
+; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
+; SI-NEXT: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @test_class_full_mask_f64(i32 addrspace(1)* %out, double %a) #0 {
+ %result = call i1 @llvm.AMDGPU.class.f64(double %a, i32 511) #1
+ %sext = sext i1 %result to i32
+ store i32 %sext, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}v_test_class_full_mask_f64:
+; SI-DAG: buffer_load_dwordx2 [[VA:v\[[0-9]+:[0-9]+\]]]
+; SI-DAG: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1ff{{$}}
+; SI: v_cmp_class_f64_e32 vcc, [[VA]], [[MASK]]
+; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
+; SI: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @v_test_class_full_mask_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %a = load double, double addrspace(1)* %gep.in
+
+ %result = call i1 @llvm.AMDGPU.class.f64(double %a, i32 511) #1
+ %sext = sext i1 %result to i32
+ store i32 %sext, i32 addrspace(1)* %gep.out, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}test_class_inline_imm_constant_dynamic_mask_f64:
+; XSI: v_cmp_class_f64_e32 vcc, 1.0,
+; SI: v_cmp_class_f64_e32 vcc,
+; SI: s_endpgm
+define void @test_class_inline_imm_constant_dynamic_mask_f64(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %b = load i32, i32 addrspace(1)* %gep.in
+
+ %result = call i1 @llvm.AMDGPU.class.f64(double 1.0, i32 %b) #1
+ %sext = sext i1 %result to i32
+ store i32 %sext, i32 addrspace(1)* %gep.out, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}test_class_lit_constant_dynamic_mask_f64:
+; SI: v_cmp_class_f64_e32 vcc, s{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}
+; SI: s_endpgm
+define void @test_class_lit_constant_dynamic_mask_f64(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %b = load i32, i32 addrspace(1)* %gep.in
+
+ %result = call i1 @llvm.AMDGPU.class.f64(double 1024.0, i32 %b) #1
+ %sext = sext i1 %result to i32
+ store i32 %sext, i32 addrspace(1)* %gep.out, align 4
+ ret void
+}
+
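+; OR of two class tests of the same value should fold into one compare with the
+; masks OR'd together (1 | 3 = 3 here).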
+; SI-LABEL: {{^}}test_fold_or_class_f32_0:
+; SI-NOT: v_cmp_class
+; SI: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 3{{$}}
+; SI-NOT: v_cmp_class
+; SI: s_endpgm
+define void @test_fold_or_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %a = load float, float addrspace(1)* %gep.in
+
+ %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 1) #1
+ %class1 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 3) #1
+ %or = or i1 %class0, %class1
+
+ %sext = sext i1 %or to i32
+ store i32 %sext, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}test_fold_or3_class_f32_0:
+; SI-NOT: v_cmp_class
+; SI: v_cmp_class_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 7{{$}}
+; SI-NOT: v_cmp_class
+; SI: s_endpgm
+define void @test_fold_or3_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %a = load float, float addrspace(1)* %gep.in
+
+ %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 1) #1
+ %class1 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 2) #1
+ %class2 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 4) #1
+ %or.0 = or i1 %class0, %class1
+ %or.1 = or i1 %or.0, %class2
+
+ %sext = sext i1 %or.1 to i32
+ store i32 %sext, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}test_fold_or_all_tests_class_f32_0:
+; SI-NOT: v_cmp_class
+; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x3ff{{$}}
+; SI: v_cmp_class_f32_e32 vcc, v{{[0-9]+}}, [[MASK]]{{$}}
+; SI-NOT: v_cmp_class
+; SI: s_endpgm
+define void @test_fold_or_all_tests_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %a = load float, float addrspace(1)* %gep.in
+
+ %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 1) #1
+ %class1 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 2) #1
+ %class2 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 4) #1
+ %class3 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 8) #1
+ %class4 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 16) #1
+ %class5 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 32) #1
+ %class6 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 64) #1
+ %class7 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 128) #1
+ %class8 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 256) #1
+ %class9 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 512) #1
+ %or.0 = or i1 %class0, %class1
+ %or.1 = or i1 %or.0, %class2
+ %or.2 = or i1 %or.1, %class3
+ %or.3 = or i1 %or.2, %class4
+ %or.4 = or i1 %or.3, %class5
+ %or.5 = or i1 %or.4, %class6
+ %or.6 = or i1 %or.5, %class7
+ %or.7 = or i1 %or.6, %class8
+ %or.8 = or i1 %or.7, %class9
+ %sext = sext i1 %or.8 to i32
+ store i32 %sext, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}test_fold_or_class_f32_1:
+; SI-NOT: v_cmp_class
+; SI: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 12{{$}}
+; SI-NOT: v_cmp_class
+; SI: s_endpgm
+define void @test_fold_or_class_f32_1(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %a = load float, float addrspace(1)* %gep.in
+
+ %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 4) #1
+ %class1 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 8) #1
+ %or = or i1 %class0, %class1
+
+ %sext = sext i1 %or to i32
+ store i32 %sext, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}test_fold_or_class_f32_2:
+; SI-NOT: v_cmp_class
+; SI: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 7{{$}}
+; SI-NOT: v_cmp_class
+; SI: s_endpgm
+define void @test_fold_or_class_f32_2(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %a = load float, float addrspace(1)* %gep.in
+
+ %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 7) #1
+ %class1 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 7) #1
+ %or = or i1 %class0, %class1
+
+ %sext = sext i1 %or to i32
+ store i32 %sext, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}test_no_fold_or_class_f32_0:
+; SI-DAG: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 4{{$}}
+; SI-DAG: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, 8{{$}}
+; SI: s_or_b64
+; SI: s_endpgm
+define void @test_no_fold_or_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in, float %b) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %a = load float, float addrspace(1)* %gep.in
+
+ %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 4) #1
+ %class1 = call i1 @llvm.AMDGPU.class.f32(float %b, i32 8) #1
+ %or = or i1 %class0, %class1
+
+ %sext = sext i1 %or to i32
+ store i32 %sext, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}test_class_0_f32:
+; SI-NOT: v_cmp_class
+; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}}
+; SI: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @test_class_0_f32(i32 addrspace(1)* %out, float %a) #0 {
+ %result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 0) #1
+ %sext = sext i1 %result to i32
+ store i32 %sext, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}test_class_0_f64:
+; SI-NOT: v_cmp_class
+; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}}
+; SI: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @test_class_0_f64(i32 addrspace(1)* %out, double %a) #0 {
+ %result = call i1 @llvm.AMDGPU.class.f64(double %a, i32 0) #1
+ %sext = sext i1 %result to i32
+ store i32 %sext, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.cube.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.cube.ll
new file mode 100644
index 000000000000..e95a51093cb7
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.cube.ll
@@ -0,0 +1,59 @@
+
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+; CHECK: {{^}}cube:
+; CHECK: CUBE T{{[0-9]}}.X
+; CHECK: CUBE T{{[0-9]}}.Y
+; CHECK: CUBE T{{[0-9]}}.Z
+; CHECK: CUBE * T{{[0-9]}}.W
+define void @cube() #0 {
+main_body:
+ %0 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
+ %1 = extractelement <4 x float> %0, i32 3
+ %2 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
+ %3 = extractelement <4 x float> %2, i32 0
+ %4 = fdiv float %3, %1
+ %5 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
+ %6 = extractelement <4 x float> %5, i32 1
+ %7 = fdiv float %6, %1
+ %8 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
+ %9 = extractelement <4 x float> %8, i32 2
+ %10 = fdiv float %9, %1
+ %11 = insertelement <4 x float> undef, float %4, i32 0
+ %12 = insertelement <4 x float> %11, float %7, i32 1
+ %13 = insertelement <4 x float> %12, float %10, i32 2
+ %14 = insertelement <4 x float> %13, float 1.000000e+00, i32 3
+ %15 = call <4 x float> @llvm.AMDGPU.cube(<4 x float> %14)
+ %16 = extractelement <4 x float> %15, i32 0
+ %17 = extractelement <4 x float> %15, i32 1
+ %18 = extractelement <4 x float> %15, i32 2
+ %19 = extractelement <4 x float> %15, i32 3
+ %20 = call float @fabs(float %18)
+ %21 = fdiv float 1.000000e+00, %20
+ %22 = fmul float %16, %21
+ %23 = fadd float %22, 1.500000e+00
+ %24 = fmul float %17, %21
+ %25 = fadd float %24, 1.500000e+00
+ %26 = insertelement <4 x float> undef, float %25, i32 0
+ %27 = insertelement <4 x float> %26, float %23, i32 1
+ %28 = insertelement <4 x float> %27, float %19, i32 2
+ %29 = insertelement <4 x float> %28, float %25, i32 3
+ %30 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %29, i32 16, i32 0, i32 4)
+ call void @llvm.R600.store.swizzle(<4 x float> %30, i32 0, i32 0)
+ ret void
+}
+
+; Function Attrs: readnone
+declare <4 x float> @llvm.AMDGPU.cube(<4 x float>) #1
+
+; Function Attrs: readnone
+declare float @fabs(float) #1
+
+; Function Attrs: readnone
+declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) #1
+
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+
+attributes #0 = { "ShaderType"="0" }
+attributes #1 = { readnone }
+
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.cvt_f32_ubyte.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.cvt_f32_ubyte.ll
new file mode 100644
index 000000000000..8b32f696449e
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.cvt_f32_ubyte.ll
@@ -0,0 +1,43 @@
+; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI %s
+
+declare float @llvm.AMDGPU.cvt.f32.ubyte0(i32) nounwind readnone
+declare float @llvm.AMDGPU.cvt.f32.ubyte1(i32) nounwind readnone
+declare float @llvm.AMDGPU.cvt.f32.ubyte2(i32) nounwind readnone
+declare float @llvm.AMDGPU.cvt.f32.ubyte3(i32) nounwind readnone
+
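+; v_cvt_f32_ubyte<n> converts byte n of the 32-bit source to a float, so each
+; intrinsic below should select the corresponding instruction directly.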
+; SI-LABEL: {{^}}test_unpack_byte0_to_float:
+; SI: v_cvt_f32_ubyte0
+define void @test_unpack_byte0_to_float(float addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %val = load i32, i32 addrspace(1)* %in, align 4
+ %cvt = call float @llvm.AMDGPU.cvt.f32.ubyte0(i32 %val) nounwind readnone
+ store float %cvt, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}test_unpack_byte1_to_float:
+; SI: v_cvt_f32_ubyte1
+define void @test_unpack_byte1_to_float(float addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %val = load i32, i32 addrspace(1)* %in, align 4
+ %cvt = call float @llvm.AMDGPU.cvt.f32.ubyte1(i32 %val) nounwind readnone
+ store float %cvt, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}test_unpack_byte2_to_float:
+; SI: v_cvt_f32_ubyte2
+define void @test_unpack_byte2_to_float(float addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %val = load i32, i32 addrspace(1)* %in, align 4
+ %cvt = call float @llvm.AMDGPU.cvt.f32.ubyte2(i32 %val) nounwind readnone
+ store float %cvt, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}test_unpack_byte3_to_float:
+; SI: v_cvt_f32_ubyte3
+define void @test_unpack_byte3_to_float(float addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %val = load i32, i32 addrspace(1)* %in, align 4
+ %cvt = call float @llvm.AMDGPU.cvt.f32.ubyte3(i32 %val) nounwind readnone
+ store float %cvt, float addrspace(1)* %out, align 4
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.div_fixup.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.div_fixup.ll
new file mode 100644
index 000000000000..55ca9c7536e5
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.div_fixup.ll
@@ -0,0 +1,31 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
+
+declare float @llvm.AMDGPU.div.fixup.f32(float, float, float) nounwind readnone
+declare double @llvm.AMDGPU.div.fixup.f64(double, double, double) nounwind readnone
+
+; GCN-LABEL: {{^}}test_div_fixup_f32:
+; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
+; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
+; VI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; VI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x34
+; VI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
+; GCN-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]]
+; GCN-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
+; GCN: v_div_fixup_f32 [[RESULT:v[0-9]+]], [[SA]], [[VB]], [[VC]]
+; GCN: buffer_store_dword [[RESULT]],
+; GCN: s_endpgm
+define void @test_div_fixup_f32(float addrspace(1)* %out, float %a, float %b, float %c) nounwind {
+ %result = call float @llvm.AMDGPU.div.fixup.f32(float %a, float %b, float %c) nounwind readnone
+ store float %result, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_div_fixup_f64:
+; GCN: v_div_fixup_f64
+define void @test_div_fixup_f64(double addrspace(1)* %out, double %a, double %b, double %c) nounwind {
+ %result = call double @llvm.AMDGPU.div.fixup.f64(double %a, double %b, double %c) nounwind readnone
+ store double %result, double addrspace(1)* %out, align 8
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.div_fmas.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.div_fmas.ll
new file mode 100644
index 000000000000..bcb7f870f1f4
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.div_fmas.ll
@@ -0,0 +1,179 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; XUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+
+; FIXME: Enable for VI.
+
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+declare void @llvm.AMDGPU.barrier.global() nounwind noduplicate
+declare float @llvm.AMDGPU.div.fmas.f32(float, float, float, i1) nounwind readnone
+declare double @llvm.AMDGPU.div.fmas.f64(double, double, double, i1) nounwind readnone
+
+; GCN-LABEL: {{^}}test_div_fmas_f32:
+; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
+; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
+; VI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; VI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x34
+; VI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
+; GCN-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]]
+; GCN-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
+; GCN-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]]
+; GCN: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VB]], [[VA]], [[VC]]
+; GCN: buffer_store_dword [[RESULT]],
+; GCN: s_endpgm
+define void @test_div_fmas_f32(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind {
+ %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float %c, i1 %d) nounwind readnone
+ store float %result, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_div_fmas_f32_inline_imm_0:
+; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
+; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
+; SI-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]]
+; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
+; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], 1.0, [[VB]], [[VC]]
+; SI: buffer_store_dword [[RESULT]],
+; SI: s_endpgm
+define void @test_div_fmas_f32_inline_imm_0(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind {
+ %result = call float @llvm.AMDGPU.div.fmas.f32(float 1.0, float %b, float %c, i1 %d) nounwind readnone
+ store float %result, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_div_fmas_f32_inline_imm_1:
+; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
+; SI-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]]
+; SI-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]]
+; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], 1.0, [[VA]], [[VC]]
+; SI: buffer_store_dword [[RESULT]],
+; SI: s_endpgm
+define void @test_div_fmas_f32_inline_imm_1(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind {
+ %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float 1.0, float %c, i1 %d) nounwind readnone
+ store float %result, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_div_fmas_f32_inline_imm_2:
+; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
+; SI-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]]
+; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
+; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VA]], [[VB]], 1.0
+; SI: buffer_store_dword [[RESULT]],
+; SI: s_endpgm
+define void @test_div_fmas_f32_inline_imm_2(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind {
+ %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float 1.0, i1 %d) nounwind readnone
+ store float %result, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_div_fmas_f64:
+; GCN: v_div_fmas_f64
+define void @test_div_fmas_f64(double addrspace(1)* %out, double %a, double %b, double %c, i1 %d) nounwind {
+ %result = call double @llvm.AMDGPU.div.fmas.f64(double %a, double %b, double %c, i1 %d) nounwind readnone
+ store double %result, double addrspace(1)* %out, align 8
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_div_fmas_f32_cond_to_vcc:
+; SI: v_cmp_eq_i32_e64 vcc, 0, s{{[0-9]+}}
+; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+define void @test_div_fmas_f32_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c, i32 %i) nounwind {
+ %cmp = icmp eq i32 %i, 0
+ %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float %c, i1 %cmp) nounwind readnone
+ store float %result, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_div_fmas_f32_imm_false_cond_to_vcc:
+; SI: s_mov_b64 vcc, 0
+; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+define void @test_div_fmas_f32_imm_false_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c) nounwind {
+ %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float %c, i1 false) nounwind readnone
+ store float %result, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_div_fmas_f32_imm_true_cond_to_vcc:
+; SI: s_mov_b64 vcc, -1
+; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+define void @test_div_fmas_f32_imm_true_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c) nounwind {
+ %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float %c, i1 true) nounwind readnone
+ store float %result, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_div_fmas_f32_logical_cond_to_vcc:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI-DAG: buffer_load_dword [[C:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+
+; SI-DAG: v_cmp_eq_i32_e32 [[CMP0:vcc]], 0, v{{[0-9]+}}
+; SI-DAG: v_cmp_ne_i32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], 0, s{{[0-9]+}}
+; SI: s_and_b64 vcc, [[CMP0]], [[CMP1]]
+; SI: v_div_fmas_f32 {{v[0-9]+}}, [[A]], [[B]], [[C]]
+; SI: s_endpgm
+define void @test_div_fmas_f32_logical_cond_to_vcc(float addrspace(1)* %out, float addrspace(1)* %in, i32 %d) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %gep.a = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %gep.b = getelementptr float, float addrspace(1)* %gep.a, i32 1
+ %gep.c = getelementptr float, float addrspace(1)* %gep.a, i32 2
+ %gep.out = getelementptr float, float addrspace(1)* %out, i32 2
+
+ %a = load float, float addrspace(1)* %gep.a
+ %b = load float, float addrspace(1)* %gep.b
+ %c = load float, float addrspace(1)* %gep.c
+
+ %cmp0 = icmp eq i32 %tid, 0
+ %cmp1 = icmp ne i32 %d, 0
+ %and = and i1 %cmp0, %cmp1
+
+ %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float %c, i1 %and) nounwind readnone
+ store float %result, float addrspace(1)* %gep.out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_div_fmas_f32_i1_phi_vcc:
+; SI: v_cmp_eq_i32_e32 vcc, 0, v{{[0-9]+}}
+; SI: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], vcc
+; SI: s_xor_b64 [[SAVE]], exec, [[SAVE]]
+
+; SI: buffer_load_dword [[LOAD:v[0-9]+]]
+; SI: v_cmp_ne_i32_e32 vcc, 0, [[LOAD]]
+; SI: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
+
+
+; SI: BB9_2:
+; SI: s_or_b64 exec, exec, [[SAVE]]
+; SI: v_cmp_ne_i32_e32 vcc, 0, v0
+; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+; SI: buffer_store_dword
+; SI: s_endpgm
+define void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out, float addrspace(1)* %in, i32 addrspace(1)* %dummy) nounwind {
+entry:
+ %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %gep.out = getelementptr float, float addrspace(1)* %out, i32 2
+ %gep.a = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %gep.b = getelementptr float, float addrspace(1)* %gep.a, i32 1
+ %gep.c = getelementptr float, float addrspace(1)* %gep.a, i32 2
+
+ %a = load float, float addrspace(1)* %gep.a
+ %b = load float, float addrspace(1)* %gep.b
+ %c = load float, float addrspace(1)* %gep.c
+
+ %cmp0 = icmp eq i32 %tid, 0
+ br i1 %cmp0, label %bb, label %exit
+
+bb:
+ %val = load i32, i32 addrspace(1)* %dummy
+ %cmp1 = icmp ne i32 %val, 0
+ br label %exit
+
+exit:
+ %cond = phi i1 [false, %entry], [%cmp1, %bb]
+ %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float %c, i1 %cond) nounwind readnone
+ store float %result, float addrspace(1)* %gep.out, align 4
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.div_scale.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.div_scale.ll
new file mode 100644
index 000000000000..de830de039c7
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.div_scale.ll
@@ -0,0 +1,364 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+declare { float, i1 } @llvm.AMDGPU.div.scale.f32(float, float, i1) nounwind readnone
+declare { double, i1 } @llvm.AMDGPU.div.scale.f64(double, double, i1) nounwind readnone
+declare float @llvm.fabs.f32(float) nounwind readnone
+
+; SI-LABEL @test_div_scale_f32_1:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]]
+; SI: buffer_store_dword [[RESULT0]]
+; SI: s_endpgm
+define void @test_div_scale_f32_1(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+
+ %a = load float, float addrspace(1)* %gep.0, align 4
+ %b = load float, float addrspace(1)* %gep.1, align 4
+
+ %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 false) nounwind readnone
+ %result0 = extractvalue { float, i1 } %result, 0
+ store float %result0, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL @test_div_scale_f32_2:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]]
+; SI: buffer_store_dword [[RESULT0]]
+; SI: s_endpgm
+define void @test_div_scale_f32_2(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+
+ %a = load float, float addrspace(1)* %gep.0, align 4
+ %b = load float, float addrspace(1)* %gep.1, align 4
+
+ %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 true) nounwind readnone
+ %result0 = extractvalue { float, i1 } %result, 0
+ store float %result0, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL @test_div_scale_f64_1:
+; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64
+; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
+; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]]
+; SI: buffer_store_dwordx2 [[RESULT0]]
+; SI: s_endpgm
+define void @test_div_scale_f64_1(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %in) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
+
+ %a = load double, double addrspace(1)* %gep.0, align 8
+ %b = load double, double addrspace(1)* %gep.1, align 8
+
+ %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 false) nounwind readnone
+ %result0 = extractvalue { double, i1 } %result, 0
+ store double %result0, double addrspace(1)* %out, align 8
+ ret void
+}
+
+; SI-LABEL @test_div_scale_f64_2:
+; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64
+; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
+; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]]
+; SI: buffer_store_dwordx2 [[RESULT0]]
+; SI: s_endpgm
+define void @test_div_scale_f64_2(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %in) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
+
+ %a = load double, double addrspace(1)* %gep.0, align 8
+ %b = load double, double addrspace(1)* %gep.1, align 8
+
+ %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 true) nounwind readnone
+ %result0 = extractvalue { double, i1 } %result, 0
+ store double %result0, double addrspace(1)* %out, align 8
+ ret void
+}
+
+; SI-LABEL @test_div_scale_f32_scalar_num_1:
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]]
+; SI-DAG: s_load_dword [[A:s[0-9]+]]
+; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]]
+; SI: buffer_store_dword [[RESULT0]]
+; SI: s_endpgm
+define void @test_div_scale_f32_scalar_num_1(float addrspace(1)* %out, float addrspace(1)* %in, float %a) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %gep = getelementptr float, float addrspace(1)* %in, i32 %tid
+
+ %b = load float, float addrspace(1)* %gep, align 4
+
+ %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 false) nounwind readnone
+ %result0 = extractvalue { float, i1 } %result, 0
+ store float %result0, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL @test_div_scale_f32_scalar_num_2:
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]]
+; SI-DAG: s_load_dword [[A:s[0-9]+]]
+; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]]
+; SI: buffer_store_dword [[RESULT0]]
+; SI: s_endpgm
+define void @test_div_scale_f32_scalar_num_2(float addrspace(1)* %out, float addrspace(1)* %in, float %a) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %gep = getelementptr float, float addrspace(1)* %in, i32 %tid
+
+ %b = load float, float addrspace(1)* %gep, align 4
+
+ %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 true) nounwind readnone
+ %result0 = extractvalue { float, i1 } %result, 0
+ store float %result0, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL @test_div_scale_f32_scalar_den_1:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]]
+; SI-DAG: s_load_dword [[B:s[0-9]+]]
+; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]]
+; SI: buffer_store_dword [[RESULT0]]
+; SI: s_endpgm
+define void @test_div_scale_f32_scalar_den_1(float addrspace(1)* %out, float addrspace(1)* %in, float %b) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %gep = getelementptr float, float addrspace(1)* %in, i32 %tid
+
+ %a = load float, float addrspace(1)* %gep, align 4
+
+ %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 false) nounwind readnone
+ %result0 = extractvalue { float, i1 } %result, 0
+ store float %result0, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL @test_div_scale_f32_scalar_den_2:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]]
+; SI-DAG: s_load_dword [[B:s[0-9]+]]
+; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]]
+; SI: buffer_store_dword [[RESULT0]]
+; SI: s_endpgm
+define void @test_div_scale_f32_scalar_den_2(float addrspace(1)* %out, float addrspace(1)* %in, float %b) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %gep = getelementptr float, float addrspace(1)* %in, i32 %tid
+
+ %a = load float, float addrspace(1)* %gep, align 4
+
+ %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 true) nounwind readnone
+ %result0 = extractvalue { float, i1 } %result, 0
+ store float %result0, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL @test_div_scale_f64_scalar_num_1:
+; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]]
+; SI-DAG: s_load_dwordx2 [[A:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd
+; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]]
+; SI: buffer_store_dwordx2 [[RESULT0]]
+; SI: s_endpgm
+define void @test_div_scale_f64_scalar_num_1(double addrspace(1)* %out, double addrspace(1)* %in, double %a) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %gep = getelementptr double, double addrspace(1)* %in, i32 %tid
+
+ %b = load double, double addrspace(1)* %gep, align 8
+
+ %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 false) nounwind readnone
+ %result0 = extractvalue { double, i1 } %result, 0
+ store double %result0, double addrspace(1)* %out, align 8
+ ret void
+}
+
+; SI-LABEL @test_div_scale_f64_scalar_num_2:
+; SI-DAG: s_load_dwordx2 [[A:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd
+; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]]
+; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]]
+; SI: buffer_store_dwordx2 [[RESULT0]]
+; SI: s_endpgm
+define void @test_div_scale_f64_scalar_num_2(double addrspace(1)* %out, double addrspace(1)* %in, double %a) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %gep = getelementptr double, double addrspace(1)* %in, i32 %tid
+
+ %b = load double, double addrspace(1)* %gep, align 8
+
+ %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 true) nounwind readnone
+ %result0 = extractvalue { double, i1 } %result, 0
+ store double %result0, double addrspace(1)* %out, align 8
+ ret void
+}
+
+; SI-LABEL @test_div_scale_f64_scalar_den_1:
+; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
+; SI-DAG: s_load_dwordx2 [[B:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd
+; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]]
+; SI: buffer_store_dwordx2 [[RESULT0]]
+; SI: s_endpgm
+define void @test_div_scale_f64_scalar_den_1(double addrspace(1)* %out, double addrspace(1)* %in, double %b) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %gep = getelementptr double, double addrspace(1)* %in, i32 %tid
+
+ %a = load double, double addrspace(1)* %gep, align 8
+
+ %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 false) nounwind readnone
+ %result0 = extractvalue { double, i1 } %result, 0
+ store double %result0, double addrspace(1)* %out, align 8
+ ret void
+}
+
+; SI-LABEL @test_div_scale_f64_scalar_den_2:
+; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
+; SI-DAG: s_load_dwordx2 [[B:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd
+; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]]
+; SI: buffer_store_dwordx2 [[RESULT0]]
+; SI: s_endpgm
+define void @test_div_scale_f64_scalar_den_2(double addrspace(1)* %out, double addrspace(1)* %in, double %b) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %gep = getelementptr double, double addrspace(1)* %in, i32 %tid
+
+ %a = load double, double addrspace(1)* %gep, align 8
+
+ %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 true) nounwind readnone
+ %result0 = extractvalue { double, i1 } %result, 0
+ store double %result0, double addrspace(1)* %out, align 8
+ ret void
+}
+
+; SI-LABEL @test_div_scale_f32_all_scalar_1:
+; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc
+; SI: v_mov_b32_e32 [[VA:v[0-9]+]], [[A]]
+; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[VA]]
+; SI: buffer_store_dword [[RESULT0]]
+; SI: s_endpgm
+define void @test_div_scale_f32_all_scalar_1(float addrspace(1)* %out, float %a, float %b) nounwind {
+ %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 false) nounwind readnone
+ %result0 = extractvalue { float, i1 } %result, 0
+ store float %result0, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL @test_div_scale_f32_all_scalar_2:
+; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc
+; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[B]]
+; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[VB]], [[A]]
+; SI: buffer_store_dword [[RESULT0]]
+; SI: s_endpgm
+define void @test_div_scale_f32_all_scalar_2(float addrspace(1)* %out, float %a, float %b) nounwind {
+ %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 true) nounwind readnone
+ %result0 = extractvalue { float, i1 } %result, 0
+ store float %result0, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL @test_div_scale_f64_all_scalar_1:
+; SI-DAG: s_load_dwordx2 s{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dwordx2 [[B:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd
+; SI-DAG: v_mov_b32_e32 v[[VA_LO:[0-9]+]], s[[A_LO]]
+; SI-DAG: v_mov_b32_e32 v[[VA_HI:[0-9]+]], s[[A_HI]]
+; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], v{{\[}}[[VA_LO]]:[[VA_HI]]{{\]}}
+; SI: buffer_store_dwordx2 [[RESULT0]]
+; SI: s_endpgm
+define void @test_div_scale_f64_all_scalar_1(double addrspace(1)* %out, double %a, double %b) nounwind {
+ %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 false) nounwind readnone
+ %result0 = extractvalue { double, i1 } %result, 0
+ store double %result0, double addrspace(1)* %out, align 8
+ ret void
+}
+
+; SI-LABEL @test_div_scale_f64_all_scalar_2:
+; SI-DAG: s_load_dwordx2 [[A:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dwordx2 s{{\[}}[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]{{\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0xd
+; SI-DAG: v_mov_b32_e32 v[[VB_LO:[0-9]+]], s[[B_LO]]
+; SI-DAG: v_mov_b32_e32 v[[VB_HI:[0-9]+]], s[[B_HI]]
+; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], v{{\[}}[[VB_LO]]:[[VB_HI]]{{\]}}, [[A]]
+; SI: buffer_store_dwordx2 [[RESULT0]]
+; SI: s_endpgm
+define void @test_div_scale_f64_all_scalar_2(double addrspace(1)* %out, double %a, double %b) nounwind {
+ %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 true) nounwind readnone
+ %result0 = extractvalue { double, i1 } %result, 0
+ store double %result0, double addrspace(1)* %out, align 8
+ ret void
+}
+
+; SI-LABEL @test_div_scale_f32_inline_imm_num:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[A]], 1.0
+; SI: buffer_store_dword [[RESULT0]]
+; SI: s_endpgm
+define void @test_div_scale_f32_inline_imm_num(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %a = load float, float addrspace(1)* %gep.0, align 4
+
+ %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float 1.0, float %a, i1 false) nounwind readnone
+ %result0 = extractvalue { float, i1 } %result, 0
+ store float %result0, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL @test_div_scale_f32_inline_imm_den:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], 2.0, 2.0, [[A]]
+; SI: buffer_store_dword [[RESULT0]]
+; SI: s_endpgm
+define void @test_div_scale_f32_inline_imm_den(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %a = load float, float addrspace(1)* %gep.0, align 4
+
+ %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float 2.0, i1 false) nounwind readnone
+ %result0 = extractvalue { float, i1 } %result, 0
+ store float %result0, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL @test_div_scale_f32_fabs_num:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], |[[A]]|
+; SI: buffer_store_dword [[RESULT0]]
+; SI: s_endpgm
+define void @test_div_scale_f32_fabs_num(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+
+ %a = load float, float addrspace(1)* %gep.0, align 4
+ %b = load float, float addrspace(1)* %gep.1, align 4
+
+ %a.fabs = call float @llvm.fabs.f32(float %a) nounwind readnone
+
+ %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a.fabs, float %b, i1 false) nounwind readnone
+ %result0 = extractvalue { float, i1 } %result, 0
+ store float %result0, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL @test_div_scale_f32_fabs_den:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], |[[B]]|, |[[B]]|, [[A]]
+; SI: buffer_store_dword [[RESULT0]]
+; SI: s_endpgm
+define void @test_div_scale_f32_fabs_den(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+
+ %a = load float, float addrspace(1)* %gep.0, align 4
+ %b = load float, float addrspace(1)* %gep.1, align 4
+
+ %b.fabs = call float @llvm.fabs.f32(float %b) nounwind readnone
+
+ %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b.fabs, i1 false) nounwind readnone
+ %result0 = extractvalue { float, i1 } %result, 0
+ store float %result0, float addrspace(1)* %out, align 4
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.flbit.i32.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.flbit.i32.ll
new file mode 100644
index 000000000000..20c7af8ade5e
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.flbit.i32.ll
@@ -0,0 +1,28 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+declare i32 @llvm.AMDGPU.flbit.i32(i32) nounwind readnone
+
+; FUNC-LABEL: {{^}}s_flbit:
+; SI: s_load_dword [[VAL:s[0-9]+]],
+; SI: s_flbit_i32 [[SRESULT:s[0-9]+]], [[VAL]]
+; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
+; SI: buffer_store_dword [[VRESULT]],
+; SI: s_endpgm
+define void @s_flbit(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
+ %r = call i32 @llvm.AMDGPU.flbit.i32(i32 %val) nounwind readnone
+ store i32 %r, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_flbit:
+; SI: buffer_load_dword [[VAL:v[0-9]+]],
+; SI: v_ffbh_i32_e32 [[RESULT:v[0-9]+]], [[VAL]]
+; SI: buffer_store_dword [[RESULT]],
+; SI: s_endpgm
+define void @v_flbit(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+ %val = load i32, i32 addrspace(1)* %valptr, align 4
+ %r = call i32 @llvm.AMDGPU.flbit.i32(i32 %val) nounwind readnone
+ store i32 %r, i32 addrspace(1)* %out, align 4
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.fract.f64.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.fract.f64.ll
new file mode 100644
index 000000000000..e098dd35d6da
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.fract.f64.ll
@@ -0,0 +1,60 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s
+
+declare double @llvm.fabs.f64(double %Val)
+declare double @llvm.AMDGPU.fract.f64(double) nounwind readnone
+
+; FUNC-LABEL: {{^}}fract_f64:
+; GCN: v_fract_f64_e32 [[FRC:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]]
+; SI: v_mov_b32_e32 v[[UPLO:[0-9]+]], -1
+; SI: v_mov_b32_e32 v[[UPHI:[0-9]+]], 0x3fefffff
+; SI: v_min_f64 v{{\[}}[[MINLO:[0-9]+]]:[[MINHI:[0-9]+]]], v{{\[}}[[UPLO]]:[[UPHI]]], [[FRC]]
+; SI: v_cmp_class_f64_e64 [[COND:s\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO]]:[[HI]]], 3
+; SI: v_cndmask_b32_e64 v[[RESLO:[0-9]+]], v[[LO]], v[[MINLO]], [[COND]]
+; SI: v_cndmask_b32_e64 v[[RESHI:[0-9]+]], v[[HI]], v[[MINHI]], [[COND]]
+; SI: buffer_store_dwordx2 v{{\[}}[[RESLO]]:[[RESHI]]]
+; CI: buffer_store_dwordx2 [[FRC]]
+define void @fract_f64(double addrspace(1)* %out, double addrspace(1)* %src) nounwind {
+ %val = load double, double addrspace(1)* %src, align 4
+ %fract = call double @llvm.AMDGPU.fract.f64(double %val) nounwind readnone
+ store double %fract, double addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fract_f64_neg:
+; GCN: v_fract_f64_e64 [[FRC:v\[[0-9]+:[0-9]+\]]], -v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]]
+; SI: v_mov_b32_e32 v[[UPLO:[0-9]+]], -1
+; SI: v_mov_b32_e32 v[[UPHI:[0-9]+]], 0x3fefffff
+; SI: v_min_f64 v{{\[}}[[MINLO:[0-9]+]]:[[MINHI:[0-9]+]]], v{{\[}}[[UPLO]]:[[UPHI]]], [[FRC]]
+; SI: v_cmp_class_f64_e64 [[COND:s\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO]]:[[HI]]], 3
+; SI: v_cndmask_b32_e64 v[[RESLO:[0-9]+]], v[[LO]], v[[MINLO]], [[COND]]
+; SI: v_cndmask_b32_e64 v[[RESHI:[0-9]+]], v[[HI]], v[[MINHI]], [[COND]]
+; SI: buffer_store_dwordx2 v{{\[}}[[RESLO]]:[[RESHI]]]
+; CI: buffer_store_dwordx2 [[FRC]]
+define void @fract_f64_neg(double addrspace(1)* %out, double addrspace(1)* %src) nounwind {
+ %val = load double, double addrspace(1)* %src, align 4
+ %neg = fsub double 0.0, %val
+ %fract = call double @llvm.AMDGPU.fract.f64(double %neg) nounwind readnone
+ store double %fract, double addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fract_f64_neg_abs:
+; GCN: v_fract_f64_e64 [[FRC:v\[[0-9]+:[0-9]+\]]], -|v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]]|
+; SI: v_mov_b32_e32 v[[UPLO:[0-9]+]], -1
+; SI: v_mov_b32_e32 v[[UPHI:[0-9]+]], 0x3fefffff
+; SI: v_min_f64 v{{\[}}[[MINLO:[0-9]+]]:[[MINHI:[0-9]+]]], v{{\[}}[[UPLO]]:[[UPHI]]], [[FRC]]
+; SI: v_cmp_class_f64_e64 [[COND:s\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO]]:[[HI]]], 3
+; SI: v_cndmask_b32_e64 v[[RESLO:[0-9]+]], v[[LO]], v[[MINLO]], [[COND]]
+; SI: v_cndmask_b32_e64 v[[RESHI:[0-9]+]], v[[HI]], v[[MINHI]], [[COND]]
+; SI: buffer_store_dwordx2 v{{\[}}[[RESLO]]:[[RESHI]]]
+; CI: buffer_store_dwordx2 [[FRC]]
+define void @fract_f64_neg_abs(double addrspace(1)* %out, double addrspace(1)* %src) nounwind {
+ %val = load double, double addrspace(1)* %src, align 4
+ %abs = call double @llvm.fabs.f64(double %val)
+ %neg = fsub double 0.0, %abs
+ %fract = call double @llvm.AMDGPU.fract.f64(double %neg) nounwind readnone
+ store double %fract, double addrspace(1)* %out, align 4
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.fract.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.fract.ll
new file mode 100644
index 000000000000..7501b4b75465
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.fract.ll
@@ -0,0 +1,65 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+declare float @llvm.fabs.f32(float %Val)
+declare float @llvm.AMDGPU.fract.f32(float) nounwind readnone
+
+; Legacy name
+declare float @llvm.AMDIL.fraction.f32(float) nounwind readnone
+
+; FUNC-LABEL: {{^}}fract_f32:
+; CI: v_fract_f32_e32 [[RESULT:v[0-9]+]], [[INPUT:v[0-9]+]]
+; SI: v_floor_f32_e32 [[FLR:v[0-9]+]], [[INPUT:v[0-9]+]]
+; SI: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[FLR]], [[INPUT]]
+; GCN: buffer_store_dword [[RESULT]]
+; EG: FRACT
+define void @fract_f32(float addrspace(1)* %out, float addrspace(1)* %src) nounwind {
+ %val = load float, float addrspace(1)* %src, align 4
+ %fract = call float @llvm.AMDGPU.fract.f32(float %val) nounwind readnone
+ store float %fract, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fract_f32_legacy_amdil:
+; CI: v_fract_f32_e32 [[RESULT:v[0-9]+]], [[INPUT:v[0-9]+]]
+; SI: v_floor_f32_e32 [[FLR:v[0-9]+]], [[INPUT:v[0-9]+]]
+; SI: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[FLR]], [[INPUT]]
+; GCN: buffer_store_dword [[RESULT]]
+; EG: FRACT
+define void @fract_f32_legacy_amdil(float addrspace(1)* %out, float addrspace(1)* %src) nounwind {
+ %val = load float, float addrspace(1)* %src, align 4
+ %fract = call float @llvm.AMDIL.fraction.f32(float %val) nounwind readnone
+ store float %fract, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fract_f32_neg:
+; CI: v_fract_f32_e64 [[RESULT:v[0-9]+]], -[[INPUT:v[0-9]+]]
+; SI: v_floor_f32_e64 [[FLR:v[0-9]+]], -[[INPUT:v[0-9]+]]
+; SI: v_sub_f32_e64 [[RESULT:v[0-9]+]], -[[INPUT]], [[FLR]]
+; GCN: buffer_store_dword [[RESULT]]
+; EG: FRACT
+define void @fract_f32_neg(float addrspace(1)* %out, float addrspace(1)* %src) nounwind {
+ %val = load float, float addrspace(1)* %src, align 4
+ %neg = fsub float 0.0, %val
+ %fract = call float @llvm.AMDGPU.fract.f32(float %neg) nounwind readnone
+ store float %fract, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fract_f32_neg_abs:
+; CI: v_fract_f32_e64 [[RESULT:v[0-9]+]], -|[[INPUT:v[0-9]+]]|
+; SI: v_floor_f32_e64 [[FLR:v[0-9]+]], -|[[INPUT:v[0-9]+]]|
+; SI: v_sub_f32_e64 [[RESULT:v[0-9]+]], -|[[INPUT]]|, [[FLR]]
+; GCN: buffer_store_dword [[RESULT]]
+; EG: FRACT
+define void @fract_f32_neg_abs(float addrspace(1)* %out, float addrspace(1)* %src) nounwind {
+ %val = load float, float addrspace(1)* %src, align 4
+ %abs = call float @llvm.fabs.f32(float %val)
+ %neg = fsub float 0.0, %abs
+ %fract = call float @llvm.AMDGPU.fract.f32(float %neg) nounwind readnone
+ store float %fract, float addrspace(1)* %out, align 4
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.imad24.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.imad24.ll
new file mode 100644
index 000000000000..42102e30f071
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.imad24.ll
@@ -0,0 +1,22 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=CM -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
+; XUN: llc -march=r600 -mcpu=r600 -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
+; XUN: llc -march=r600 -mcpu=r770 -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
+
+; FIXME: Store of i32 seems to be broken pre-EG somehow?
+
+declare i32 @llvm.AMDGPU.imad24(i32, i32, i32) nounwind readnone
+
+; FUNC-LABEL: {{^}}test_imad24:
+; SI: v_mad_i32_i24
+; CM: MULADD_INT24
+; R600: MULLO_INT
+; R600: ADD_INT
+define void @test_imad24(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind {
+ %mad = call i32 @llvm.AMDGPU.imad24(i32 %src0, i32 %src1, i32 %src2) nounwind readnone
+ store i32 %mad, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.imax.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.imax.ll
new file mode 100644
index 000000000000..46662f96c290
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.imax.ll
@@ -0,0 +1,33 @@
+; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=SI %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=SI %s
+
+; SI-LABEL: {{^}}vector_imax:
+; SI: v_max_i32_e32
+define void @vector_imax(i32 %p0, i32 %p1, i32 addrspace(1)* %in) #0 {
+main_body:
+ %load = load i32, i32 addrspace(1)* %in, align 4
+ %max = call i32 @llvm.AMDGPU.imax(i32 %p0, i32 %load)
+ %bc = bitcast i32 %max to float
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc)
+ ret void
+}
+
+; SI-LABEL: {{^}}scalar_imax:
+; SI: s_max_i32
+define void @scalar_imax(i32 %p0, i32 %p1) #0 {
+entry:
+ %max = call i32 @llvm.AMDGPU.imax(i32 %p0, i32 %p1)
+ %bc = bitcast i32 %max to float
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc)
+ ret void
+}
+
+; Function Attrs: readnone
+declare i32 @llvm.AMDGPU.imax(i32, i32) #1
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+
+!0 = !{!"const", null, i32 1}
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.imin.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.imin.ll
new file mode 100644
index 000000000000..34b454e23755
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.imin.ll
@@ -0,0 +1,33 @@
+; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=SI %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=SI %s
+
+; SI-LABEL: {{^}}vector_imin:
+; SI: v_min_i32_e32
+define void @vector_imin(i32 %p0, i32 %p1, i32 addrspace(1)* %in) #0 {
+main_body:
+ %load = load i32, i32 addrspace(1)* %in, align 4
+ %min = call i32 @llvm.AMDGPU.imin(i32 %p0, i32 %load)
+ %bc = bitcast i32 %min to float
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc)
+ ret void
+}
+
+; SI-LABEL: {{^}}scalar_imin:
+; SI: s_min_i32
+define void @scalar_imin(i32 %p0, i32 %p1) #0 {
+entry:
+ %min = call i32 @llvm.AMDGPU.imin(i32 %p0, i32 %p1)
+ %bc = bitcast i32 %min to float
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc)
+ ret void
+}
+
+; Function Attrs: readnone
+declare i32 @llvm.AMDGPU.imin(i32, i32) #1
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+
+!0 = !{!"const", null, i32 1}
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.imul24.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.imul24.ll
new file mode 100644
index 000000000000..fdc1172260b9
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.imul24.ll
@@ -0,0 +1,16 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=CM -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
+
+declare i32 @llvm.AMDGPU.imul24(i32, i32) nounwind readnone
+
+; FUNC-LABEL: {{^}}test_imul24:
+; SI: v_mul_i32_i24
+; CM: MUL_INT24
+; R600: MULLO_INT
+define void @test_imul24(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind {
+ %mul = call i32 @llvm.AMDGPU.imul24(i32 %src0, i32 %src1) nounwind readnone
+ store i32 %mul, i32 addrspace(1)* %out, align 4
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.kill.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.kill.ll
new file mode 100644
index 000000000000..057708e7b5cc
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.kill.ll
@@ -0,0 +1,39 @@
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+; SI-LABEL: {{^}}kill_gs_const:
+; SI-NOT: v_cmpx_le_f32
+; SI: s_mov_b64 exec, 0
+
+define void @kill_gs_const() #0 {
+main_body:
+ %0 = icmp ule i32 0, 3
+ %1 = select i1 %0, float 1.000000e+00, float -1.000000e+00
+ call void @llvm.AMDGPU.kill(float %1)
+ %2 = icmp ule i32 3, 0
+ %3 = select i1 %2, float 1.000000e+00, float -1.000000e+00
+ call void @llvm.AMDGPU.kill(float %3)
+ ret void
+}
+
+; SI-LABEL: {{^}}kill_vcc_implicit_def:
+; SI-NOT: v_cmp_gt_f32_e32 vcc,
+; SI: v_cmp_gt_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], 0, v{{[0-9]+}}
+; SI: v_cmpx_le_f32_e32 vcc, 0, v{{[0-9]+}}
+; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1.0, [[CMP]]
+define void @kill_vcc_implicit_def([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, i32, float, float) #1 {
+entry:
+ %tmp0 = fcmp olt float %13, 0.0
+ call void @llvm.AMDGPU.kill(float %14)
+ %tmp1 = select i1 %tmp0, float 1.0, float 0.0
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 1, i32 1, float %tmp1, float %tmp1, float %tmp1, float %tmp1)
+ ret void
+}
+
+declare void @llvm.AMDGPU.kill(float)
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+
+attributes #0 = { "ShaderType"="2" }
+attributes #1 = { "ShaderType"="0" }
+
+!0 = !{!"const", null, i32 1}
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.ldexp.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.ldexp.ll
new file mode 100644
index 000000000000..a59c0ce6d675
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.ldexp.ll
@@ -0,0 +1,23 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+declare float @llvm.AMDGPU.ldexp.f32(float, i32) nounwind readnone
+declare double @llvm.AMDGPU.ldexp.f64(double, i32) nounwind readnone
+
+; SI-LABEL: {{^}}test_ldexp_f32:
+; SI: v_ldexp_f32
+; SI: s_endpgm
+define void @test_ldexp_f32(float addrspace(1)* %out, float %a, i32 %b) nounwind {
+ %result = call float @llvm.AMDGPU.ldexp.f32(float %a, i32 %b) nounwind readnone
+ store float %result, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}test_ldexp_f64:
+; SI: v_ldexp_f64
+; SI: s_endpgm
+define void @test_ldexp_f64(double addrspace(1)* %out, double %a, i32 %b) nounwind {
+ %result = call double @llvm.AMDGPU.ldexp.f64(double %a, i32 %b) nounwind readnone
+ store double %result, double addrspace(1)* %out, align 8
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.legacy.rsq.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.legacy.rsq.ll
new file mode 100644
index 000000000000..4cafd563685e
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.legacy.rsq.ll
@@ -0,0 +1,13 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+declare float @llvm.AMDGPU.legacy.rsq(float) nounwind readnone
+
+; FUNC-LABEL: {{^}}rsq_legacy_f32:
+; SI: v_rsq_legacy_f32_e32
+; EG: RECIPSQRT_IEEE
+define void @rsq_legacy_f32(float addrspace(1)* %out, float %src) nounwind {
+ %rsq = call float @llvm.AMDGPU.legacy.rsq(float %src) nounwind readnone
+ store float %rsq, float addrspace(1)* %out, align 4
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.mul.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.mul.ll
new file mode 100644
index 000000000000..83b56a5029d3
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.mul.ll
@@ -0,0 +1,17 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+;CHECK: MUL NON-IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @test(<4 x float> inreg %reg0) #0 {
+ %r0 = extractelement <4 x float> %reg0, i32 0
+ %r1 = extractelement <4 x float> %reg0, i32 1
+ %r2 = call float @llvm.AMDGPU.mul( float %r0, float %r1)
+ %vec = insertelement <4 x float> undef, float %r2, i32 0
+ call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
+ ret void
+}
+
+declare float @llvm.AMDGPU.mul(float ,float ) readnone
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+
+attributes #0 = { "ShaderType"="0" } \ No newline at end of file
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.rcp.f64.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.rcp.f64.ll
new file mode 100644
index 000000000000..d2a655bf909c
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.rcp.f64.ll
@@ -0,0 +1,33 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+declare double @llvm.AMDGPU.rcp.f64(double) nounwind readnone
+declare double @llvm.sqrt.f64(double) nounwind readnone
+
+; FUNC-LABEL: {{^}}rcp_f64:
+; SI: v_rcp_f64_e32
+define void @rcp_f64(double addrspace(1)* %out, double %src) nounwind {
+ %rcp = call double @llvm.AMDGPU.rcp.f64(double %src) nounwind readnone
+ store double %rcp, double addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}rcp_pat_f64:
+; SI: v_rcp_f64_e32
+define void @rcp_pat_f64(double addrspace(1)* %out, double %src) nounwind {
+ %rcp = fdiv double 1.0, %src
+ store double %rcp, double addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}rsq_rcp_pat_f64:
+; SI-UNSAFE: v_rsq_f64_e32
+; SI-SAFE-NOT: v_rsq_f64_e32
+; SI-SAFE: v_sqrt_f64
+; SI-SAFE: v_rcp_f64
+define void @rsq_rcp_pat_f64(double addrspace(1)* %out, double %src) nounwind {
+ %sqrt = call double @llvm.sqrt.f64(double %src) nounwind readnone
+ %rcp = call double @llvm.AMDGPU.rcp.f64(double %sqrt) nounwind readnone
+ store double %rcp, double addrspace(1)* %out, align 8
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.rcp.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.rcp.ll
new file mode 100644
index 000000000000..edd6e9a72f1b
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.rcp.ll
@@ -0,0 +1,50 @@
+; RUN: llc -march=amdgcn -mcpu=SI -mattr=-fp32-denormals -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=SI-UNSAFE -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
+; XUN: llc -march=amdgcn -mcpu=SI -mattr=+fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE-SPDENORM -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp32-denormals -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=SI-UNSAFE -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
+; XUN: llc -march=amdgcn -mcpu=tonga -mattr=+fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE-SPDENORM -check-prefix=SI -check-prefix=FUNC %s
+
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG-SAFE -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+declare float @llvm.AMDGPU.rcp.f32(float) nounwind readnone
+declare double @llvm.AMDGPU.rcp.f64(double) nounwind readnone
+
+declare float @llvm.sqrt.f32(float) nounwind readnone
+
+; FUNC-LABEL: {{^}}rcp_f32:
+; SI: v_rcp_f32_e32
+; EG: RECIP_IEEE
+define void @rcp_f32(float addrspace(1)* %out, float %src) nounwind {
+ %rcp = call float @llvm.AMDGPU.rcp.f32(float %src) nounwind readnone
+ store float %rcp, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FIXME: Evergreen only ever does unsafe fp math.
+; FUNC-LABEL: {{^}}rcp_pat_f32:
+
+; SI-SAFE: v_rcp_f32_e32
+; XSI-SAFE-SPDENORM-NOT: v_rcp_f32_e32
+
+; EG: RECIP_IEEE
+
+define void @rcp_pat_f32(float addrspace(1)* %out, float %src) nounwind {
+ %rcp = fdiv float 1.0, %src
+ store float %rcp, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}rsq_rcp_pat_f32:
+; SI-UNSAFE: v_rsq_f32_e32
+; SI-SAFE: v_sqrt_f32_e32
+; SI-SAFE: v_rcp_f32_e32
+
+; EG: RECIPSQRT_IEEE
+define void @rsq_rcp_pat_f32(float addrspace(1)* %out, float %src) nounwind {
+ %sqrt = call float @llvm.sqrt.f32(float %src) nounwind readnone
+ %rcp = call float @llvm.AMDGPU.rcp.f32(float %sqrt) nounwind readnone
+ store float %rcp, float addrspace(1)* %out, align 4
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.clamped.f64.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.clamped.f64.ll
new file mode 100644
index 000000000000..67f1d22c7178
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.clamped.f64.ll
@@ -0,0 +1,23 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC %s
+
+declare double @llvm.AMDGPU.rsq.clamped.f64(double) nounwind readnone
+
+; FUNC-LABEL: {{^}}rsq_clamped_f64:
+; SI: v_rsq_clamp_f64_e32
+
+; VI: v_rsq_f64_e32 [[RSQ:v\[[0-9]+:[0-9]+\]]], s[2:3]
+; TODO: this constant should be folded:
+; VI: s_mov_b32 s[[ALLBITS:[0-9+]]], -1
+; VI: s_mov_b32 s[[HIGH1:[0-9+]]], 0x7fefffff
+; VI: s_mov_b32 s[[LOW1:[0-9+]]], s[[ALLBITS]]
+; VI: v_min_f64 v[0:1], [[RSQ]], s{{\[}}[[LOW1]]:[[HIGH1]]]
+; VI: s_mov_b32 s[[HIGH2:[0-9+]]], 0xffefffff
+; VI: s_mov_b32 s[[LOW2:[0-9+]]], s[[ALLBITS]]
+; VI: v_max_f64 v[0:1], v[0:1], s{{\[}}[[LOW2]]:[[HIGH2]]]
+
+define void @rsq_clamped_f64(double addrspace(1)* %out, double %src) nounwind {
+ %rsq_clamped = call double @llvm.AMDGPU.rsq.clamped.f64(double %src) nounwind readnone
+ store double %rsq_clamped, double addrspace(1)* %out, align 8
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.clamped.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.clamped.ll
new file mode 100644
index 000000000000..eeff2536b232
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.clamped.ll
@@ -0,0 +1,23 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+
+declare float @llvm.AMDGPU.rsq.clamped.f32(float) nounwind readnone
+
+; FUNC-LABEL: {{^}}rsq_clamped_f32:
+; SI: v_rsq_clamp_f32_e32
+
+; VI: v_rsq_f32_e32 [[RSQ:v[0-9]+]], {{s[0-9]+}}
+; VI: v_min_f32_e32 [[MIN:v[0-9]+]], 0x7f7fffff, [[RSQ]]
+; TODO: this constant should be folded:
+; VI: v_mov_b32_e32 [[MINFLT:v[0-9]+]], 0xff7fffff
+; VI: v_max_f32_e32 {{v[0-9]+}}, [[MIN]], [[MINFLT]]
+
+; EG: RECIPSQRT_CLAMPED
+
+define void @rsq_clamped_f32(float addrspace(1)* %out, float %src) nounwind {
+ %rsq_clamped = call float @llvm.AMDGPU.rsq.clamped.f32(float %src) nounwind readnone
+ store float %rsq_clamped, float addrspace(1)* %out, align 4
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.ll
new file mode 100644
index 000000000000..36b72f14db19
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.ll
@@ -0,0 +1,33 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+declare float @llvm.AMDGPU.rsq.f32(float) nounwind readnone
+
+; FUNC-LABEL: {{^}}rsq_f32:
+; SI: v_rsq_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}
+; EG: RECIPSQRT_IEEE
+define void @rsq_f32(float addrspace(1)* %out, float %src) nounwind {
+ %rsq = call float @llvm.AMDGPU.rsq.f32(float %src) nounwind readnone
+ store float %rsq, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; TODO: Really these should be constant folded
+; FUNC-LABEL: {{^}}rsq_f32_constant_4.0
+; SI: v_rsq_f32_e32 {{v[0-9]+}}, 4.0
+; EG: RECIPSQRT_IEEE
+define void @rsq_f32_constant_4.0(float addrspace(1)* %out) nounwind {
+ %rsq = call float @llvm.AMDGPU.rsq.f32(float 4.0) nounwind readnone
+ store float %rsq, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}rsq_f32_constant_100.0
+; SI: v_rsq_f32_e32 {{v[0-9]+}}, 0x42c80000
+; EG: RECIPSQRT_IEEE
+define void @rsq_f32_constant_100.0(float addrspace(1)* %out) nounwind {
+ %rsq = call float @llvm.AMDGPU.rsq.f32(float 100.0) nounwind readnone
+ store float %rsq, float addrspace(1)* %out, align 4
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.tex.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.tex.ll
new file mode 100644
index 000000000000..10206609bb57
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.tex.ll
@@ -0,0 +1,42 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN
+;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN
+;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN
+;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN
+;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:UUNN
+;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZZ}} RID:0 SID:0 CT:NNNN
+;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZZ}} RID:0 SID:0 CT:NNNN
+;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZZ}} RID:0 SID:0 CT:UUNN
+;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYYW}} RID:0 SID:0 CT:NNUN
+;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNUN
+;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYYZ}} RID:0 SID:0 CT:NNUN
+;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNUN
+;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN
+;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN
+;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN
+;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNUN
+
+define void @test(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
+ %addr = load <4 x float>, <4 x float> addrspace(1)* %in
+ %res1 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %addr, i32 0, i32 0, i32 1)
+ %res2 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res1, i32 0, i32 0, i32 2)
+ %res3 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res2, i32 0, i32 0, i32 3)
+ %res4 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res3, i32 0, i32 0, i32 4)
+ %res5 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res4, i32 0, i32 0, i32 5)
+ %res6 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res5, i32 0, i32 0, i32 6)
+ %res7 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res6, i32 0, i32 0, i32 7)
+ %res8 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res7, i32 0, i32 0, i32 8)
+ %res9 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res8, i32 0, i32 0, i32 9)
+ %res10 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res9, i32 0, i32 0, i32 10)
+ %res11 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res10, i32 0, i32 0, i32 11)
+ %res12 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res11, i32 0, i32 0, i32 12)
+ %res13 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res12, i32 0, i32 0, i32 13)
+ %res14 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res13, i32 0, i32 0, i32 14)
+ %res15 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res14, i32 0, i32 0, i32 15)
+ %res16 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res15, i32 0, i32 0, i32 16)
+ store <4 x float> %res16, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) readnone
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.trig_preop.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.trig_preop.ll
new file mode 100644
index 000000000000..6b546a7e17c1
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.trig_preop.ll
@@ -0,0 +1,30 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+declare double @llvm.AMDGPU.trig.preop.f64(double, i32) nounwind readnone
+
+; SI-LABEL: {{^}}test_trig_preop_f64:
+; SI-DAG: buffer_load_dword [[SEG:v[0-9]+]]
+; SI-DAG: buffer_load_dwordx2 [[SRC:v\[[0-9]+:[0-9]+\]]],
+; SI: v_trig_preop_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[SRC]], [[SEG]]
+; SI: buffer_store_dwordx2 [[RESULT]],
+; SI: s_endpgm
+define void @test_trig_preop_f64(double addrspace(1)* %out, double addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
+ %a = load double, double addrspace(1)* %aptr, align 8
+ %b = load i32, i32 addrspace(1)* %bptr, align 4
+ %result = call double @llvm.AMDGPU.trig.preop.f64(double %a, i32 %b) nounwind readnone
+ store double %result, double addrspace(1)* %out, align 8
+ ret void
+}
+
+; SI-LABEL: {{^}}test_trig_preop_f64_imm_segment:
+; SI: buffer_load_dwordx2 [[SRC:v\[[0-9]+:[0-9]+\]]],
+; SI: v_trig_preop_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[SRC]], 7
+; SI: buffer_store_dwordx2 [[RESULT]],
+; SI: s_endpgm
+define void @test_trig_preop_f64_imm_segment(double addrspace(1)* %out, double addrspace(1)* %aptr) nounwind {
+ %a = load double, double addrspace(1)* %aptr, align 8
+ %result = call double @llvm.AMDGPU.trig.preop.f64(double %a, i32 7) nounwind readnone
+ store double %result, double addrspace(1)* %out, align 8
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.trunc.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.trunc.ll
new file mode 100644
index 000000000000..74792e50017f
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.trunc.ll
@@ -0,0 +1,17 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600 %s
+; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI %s
+
+; R600: {{^}}amdgpu_trunc:
+; R600: TRUNC T{{[0-9]+\.[XYZW]}}, KC0[2].Z
+; SI: {{^}}amdgpu_trunc:
+; SI: v_trunc_f32
+
+define void @amdgpu_trunc(float addrspace(1)* %out, float %x) {
+entry:
+ %0 = call float @llvm.AMDGPU.trunc(float %x)
+ store float %0, float addrspace(1)* %out
+ ret void
+}
+
+declare float @llvm.AMDGPU.trunc(float) readnone
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.umad24.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.umad24.ll
new file mode 100644
index 000000000000..77a073b0cb03
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.umad24.ll
@@ -0,0 +1,38 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; XUN: llc -march=r600 -mcpu=r600 -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
+; XUN: llc -march=r600 -mcpu=rv770 -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
+
+declare i32 @llvm.AMDGPU.umad24(i32, i32, i32) nounwind readnone
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+
+; FUNC-LABEL: {{^}}test_umad24:
+; SI: v_mad_u32_u24
+; EG: MULADD_UINT24
+; R600: MULLO_UINT
+; R600: ADD_INT
+define void @test_umad24(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind {
+ %mad = call i32 @llvm.AMDGPU.umad24(i32 %src0, i32 %src1, i32 %src2) nounwind readnone
+ store i32 %mad, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}commute_umad24:
+; SI-DAG: buffer_load_dword [[SRC0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[SRC2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; SI: v_mad_u32_u24 [[RESULT:v[0-9]+]], 4, [[SRC0]], [[SRC2]]
+; SI: buffer_store_dword [[RESULT]]
+define void @commute_umad24(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %src0.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %src2.gep = getelementptr i32, i32 addrspace(1)* %src0.gep, i32 1
+
+ %src0 = load i32, i32 addrspace(1)* %src0.gep, align 4
+ %src2 = load i32, i32 addrspace(1)* %src2.gep, align 4
+ %mad = call i32 @llvm.AMDGPU.umad24(i32 %src0, i32 4, i32 %src2) nounwind readnone
+ store i32 %mad, i32 addrspace(1)* %out.gep, align 4
+ ret void
+}
+
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.umax.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.umax.ll
new file mode 100644
index 000000000000..a97d103016d3
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.umax.ll
@@ -0,0 +1,48 @@
+; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=SI %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=SI %s
+
+; SI-LABEL: {{^}}vector_umax:
+; SI: v_max_u32_e32
+define void @vector_umax(i32 %p0, i32 %p1, i32 addrspace(1)* %in) #0 {
+main_body:
+ %load = load i32, i32 addrspace(1)* %in, align 4
+ %max = call i32 @llvm.AMDGPU.umax(i32 %p0, i32 %load)
+ %bc = bitcast i32 %max to float
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc)
+ ret void
+}
+
+; SI-LABEL: {{^}}scalar_umax:
+; SI: s_max_u32
+define void @scalar_umax(i32 %p0, i32 %p1) #0 {
+entry:
+ %max = call i32 @llvm.AMDGPU.umax(i32 %p0, i32 %p1)
+ %bc = bitcast i32 %max to float
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc)
+ ret void
+}
+
+; SI-LABEL: {{^}}trunc_zext_umax:
+; SI: buffer_load_ubyte [[VREG:v[0-9]+]],
+; SI: v_max_u32_e32 [[RESULT:v[0-9]+]], 0, [[VREG]]
+; SI-NOT: and
+; SI: buffer_store_short [[RESULT]],
+define void @trunc_zext_umax(i16 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %src) nounwind {
+ %tmp5 = load i8, i8 addrspace(1)* %src, align 1
+ %tmp2 = zext i8 %tmp5 to i32
+ %tmp3 = tail call i32 @llvm.AMDGPU.umax(i32 %tmp2, i32 0) nounwind readnone
+ %tmp4 = trunc i32 %tmp3 to i8
+ %tmp6 = zext i8 %tmp4 to i16
+ store i16 %tmp6, i16 addrspace(1)* %out, align 2
+ ret void
+}
+
+; Function Attrs: readnone
+declare i32 @llvm.AMDGPU.umax(i32, i32) #1
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+
+!0 = !{!"const", null, i32 1}
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.umin.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.umin.ll
new file mode 100644
index 000000000000..2acd10e0c631
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.umin.ll
@@ -0,0 +1,48 @@
+; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=SI %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=SI %s
+
+; SI-LABEL: {{^}}vector_umin:
+; SI: v_min_u32_e32
+define void @vector_umin(i32 %p0, i32 %p1, i32 addrspace(1)* %in) #0 {
+main_body:
+ %load = load i32, i32 addrspace(1)* %in, align 4
+ %min = call i32 @llvm.AMDGPU.umin(i32 %p0, i32 %load)
+ %bc = bitcast i32 %min to float
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc)
+ ret void
+}
+
+; SI-LABEL: {{^}}scalar_umin:
+; SI: s_min_u32
+define void @scalar_umin(i32 %p0, i32 %p1) #0 {
+entry:
+ %min = call i32 @llvm.AMDGPU.umin(i32 %p0, i32 %p1)
+ %bc = bitcast i32 %min to float
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc)
+ ret void
+}
+
+; SI-LABEL: {{^}}trunc_zext_umin:
+; SI: buffer_load_ubyte [[VREG:v[0-9]+]],
+; SI: v_min_u32_e32 [[RESULT:v[0-9]+]], 0, [[VREG]]
+; SI-NOT: and
+; SI: buffer_store_short [[RESULT]],
+define void @trunc_zext_umin(i16 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %src) nounwind {
+ %tmp5 = load i8, i8 addrspace(1)* %src, align 1
+ %tmp2 = zext i8 %tmp5 to i32
+ %tmp3 = tail call i32 @llvm.AMDGPU.umin(i32 %tmp2, i32 0) nounwind readnone
+ %tmp4 = trunc i32 %tmp3 to i8
+ %tmp6 = zext i8 %tmp4 to i16
+ store i16 %tmp6, i16 addrspace(1)* %out, align 2
+ ret void
+}
+
+; Function Attrs: readnone
+declare i32 @llvm.AMDGPU.umin(i32, i32) #1
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+
+!0 = !{!"const", null, i32 1}
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.umul24.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.umul24.ll
new file mode 100644
index 000000000000..76624a078b3a
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.umul24.ll
@@ -0,0 +1,18 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; XUN: llc -march=r600 -mcpu=r600 -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
+; XUN: llc -march=r600 -mcpu=rv770 -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
+
+declare i32 @llvm.AMDGPU.umul24(i32, i32) nounwind readnone
+
+; FUNC-LABEL: {{^}}test_umul24:
+; SI: v_mul_u32_u24
+; R600: MUL_UINT24
+; R600: MULLO_UINT
+define void @test_umul24(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind {
+ %mul = call i32 @llvm.AMDGPU.umul24(i32 %src0, i32 %src1) nounwind readnone
+ store i32 %mul, i32 addrspace(1)* %out, align 4
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/llvm.SI.fs.interp.ll b/test/CodeGen/AMDGPU/llvm.SI.fs.interp.ll
new file mode 100644
index 000000000000..3d05da616e4e
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.SI.fs.interp.ll
@@ -0,0 +1,59 @@
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=GCN %s
+;RUN: llc < %s -march=amdgcn -mcpu=kabini -verify-machineinstrs | FileCheck --check-prefix=GCN --check-prefix=16BANK %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=GCN %s
+
+;GCN-LABEL: {{^}}main:
+;GCN-NOT: s_wqm
+;GCN: s_mov_b32
+;GCN-NEXT: v_interp_mov_f32
+;GCN: v_interp_p1_f32
+;GCN: v_interp_p2_f32
+
+define void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>) #0 {
+main_body:
+ %5 = call float @llvm.SI.fs.constant(i32 0, i32 0, i32 %3)
+ %6 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %3, <2 x i32> %4)
+ %7 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %3, <2 x i32> %4)
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %5, float %6, float %7, float %7)
+ ret void
+}
+
+; Test that v_interp_p1 uses different source and destination registers
+; on 16-bank LDS chips.
+
+; 16BANK-LABEL: {{^}}v_interp_p1_bank16_bug:
+; 16BANK-NOT: v_interp_p1_f32 [[DST:v[0-9]+]], [[DST]]
+
+define void @v_interp_p1_bank16_bug([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, i32, float, float) #0 {
+main_body:
+ %22 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %5, <2 x i32> %7)
+ %23 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %5, <2 x i32> %7)
+ %24 = call float @llvm.SI.fs.interp(i32 2, i32 0, i32 %5, <2 x i32> %7)
+ %25 = call float @fabs(float %22)
+ %26 = call float @fabs(float %23)
+ %27 = call float @fabs(float %24)
+ %28 = call i32 @llvm.SI.packf16(float %25, float %26)
+ %29 = bitcast i32 %28 to float
+ %30 = call i32 @llvm.SI.packf16(float %27, float 1.000000e+00)
+ %31 = bitcast i32 %30 to float
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %29, float %31, float %29, float %31)
+ ret void
+}
+
+; Function Attrs: readnone
+declare float @fabs(float) #2
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.SI.packf16(float, float) #1
+
+; Function Attrs: nounwind readnone
+declare float @llvm.SI.fs.constant(i32, i32, i32) #1
+
+; Function Attrs: nounwind readnone
+declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+
+attributes #0 = { "ShaderType"="0" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.SI.gather4.ll b/test/CodeGen/AMDGPU/llvm.SI.gather4.ll
new file mode 100644
index 000000000000..275cb580bc9b
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.SI.gather4.ll
@@ -0,0 +1,509 @@
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
+
+;CHECK-LABEL: {{^}}gather4_v2:
+;CHECK: image_gather4 {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @gather4_v2() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.gather4.v2i32(<2 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}gather4:
+;CHECK: image_gather4 {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @gather4() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.gather4.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}gather4_cl:
+;CHECK: image_gather4_cl {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @gather4_cl() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.gather4.cl.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}gather4_l:
+;CHECK: image_gather4_l {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @gather4_l() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.gather4.l.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}gather4_b:
+;CHECK: image_gather4_b {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @gather4_b() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.gather4.b.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}gather4_b_cl:
+;CHECK: image_gather4_b_cl {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @gather4_b_cl() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.gather4.b.cl.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}gather4_b_cl_v8:
+;CHECK: image_gather4_b_cl {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @gather4_b_cl_v8() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.gather4.b.cl.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}gather4_lz_v2:
+;CHECK: image_gather4_lz {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @gather4_lz_v2() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.gather4.lz.v2i32(<2 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}gather4_lz:
+;CHECK: image_gather4_lz {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @gather4_lz() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.gather4.lz.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+
+
+;CHECK-LABEL: {{^}}gather4_o:
+;CHECK: image_gather4_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @gather4_o() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.gather4.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}gather4_cl_o:
+;CHECK: image_gather4_cl_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @gather4_cl_o() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.gather4.cl.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}gather4_cl_o_v8:
+;CHECK: image_gather4_cl_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @gather4_cl_o_v8() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.gather4.cl.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}gather4_l_o:
+;CHECK: image_gather4_l_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @gather4_l_o() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.gather4.l.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}gather4_l_o_v8:
+;CHECK: image_gather4_l_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @gather4_l_o_v8() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.gather4.l.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}gather4_b_o:
+;CHECK: image_gather4_b_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @gather4_b_o() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.gather4.b.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}gather4_b_o_v8:
+;CHECK: image_gather4_b_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @gather4_b_o_v8() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.gather4.b.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}gather4_b_cl_o:
+;CHECK: image_gather4_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @gather4_b_cl_o() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.gather4.b.cl.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}gather4_lz_o:
+;CHECK: image_gather4_lz_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @gather4_lz_o() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.gather4.lz.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+
+
+;CHECK-LABEL: {{^}}gather4_c:
+;CHECK: image_gather4_c {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @gather4_c() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.gather4.c.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}gather4_c_cl:
+;CHECK: image_gather4_c_cl {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @gather4_c_cl() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.gather4.c.cl.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}gather4_c_cl_v8:
+;CHECK: image_gather4_c_cl {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @gather4_c_cl_v8() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.gather4.c.cl.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}gather4_c_l:
+;CHECK: image_gather4_c_l {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @gather4_c_l() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.gather4.c.l.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}gather4_c_l_v8:
+;CHECK: image_gather4_c_l {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @gather4_c_l_v8() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.gather4.c.l.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}gather4_c_b:
+;CHECK: image_gather4_c_b {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @gather4_c_b() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.gather4.c.b.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}gather4_c_b_v8:
+;CHECK: image_gather4_c_b {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @gather4_c_b_v8() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.gather4.c.b.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}gather4_c_b_cl:
+;CHECK: image_gather4_c_b_cl {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @gather4_c_b_cl() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.gather4.c.b.cl.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}gather4_c_lz:
+;CHECK: image_gather4_c_lz {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @gather4_c_lz() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.gather4.c.lz.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+
+
+;CHECK-LABEL: {{^}}gather4_c_o:
+;CHECK: image_gather4_c_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @gather4_c_o() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.gather4.c.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}gather4_c_o_v8:
+;CHECK: image_gather4_c_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @gather4_c_o_v8() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.gather4.c.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}gather4_c_cl_o:
+;CHECK: image_gather4_c_cl_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @gather4_c_cl_o() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.gather4.c.cl.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}gather4_c_l_o:
+;CHECK: image_gather4_c_l_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @gather4_c_l_o() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.gather4.c.l.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}gather4_c_b_o:
+;CHECK: image_gather4_c_b_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @gather4_c_b_o() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.gather4.c.b.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}gather4_c_b_cl_o:
+;CHECK: image_gather4_c_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @gather4_c_b_cl_o() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.gather4.c.b.cl.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}gather4_c_lz_o:
+;CHECK: image_gather4_c_lz_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @gather4_c_lz_o() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.gather4.c.lz.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}gather4_c_lz_o_v8:
+;CHECK: image_gather4_c_lz_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @gather4_c_lz_o_v8() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.gather4.c.lz.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+
+
+declare <4 x float> @llvm.SI.gather4.v2i32(<2 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.gather4.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.gather4.cl.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.gather4.l.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.gather4.b.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.gather4.b.cl.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.gather4.b.cl.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.gather4.lz.v2i32(<2 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.gather4.lz.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+
+declare <4 x float> @llvm.SI.gather4.o.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.gather4.cl.o.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.gather4.cl.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.gather4.l.o.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.gather4.l.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.gather4.b.o.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.gather4.b.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.gather4.b.cl.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.gather4.lz.o.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+
+declare <4 x float> @llvm.SI.gather4.c.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.gather4.c.cl.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.gather4.c.cl.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.gather4.c.l.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.gather4.c.l.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.gather4.c.b.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.gather4.c.b.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.gather4.c.b.cl.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.gather4.c.lz.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+
+declare <4 x float> @llvm.SI.gather4.c.o.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.gather4.c.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.gather4.c.cl.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.gather4.c.l.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.gather4.c.b.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.gather4.c.b.cl.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.gather4.c.lz.o.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.gather4.c.lz.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+
+attributes #0 = { "ShaderType"="0" }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.SI.getlod.ll b/test/CodeGen/AMDGPU/llvm.SI.getlod.ll
new file mode 100644
index 000000000000..06ee98e91b31
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.SI.getlod.ll
@@ -0,0 +1,45 @@
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
+
+;CHECK-LABEL: {{^}}getlod:
+;CHECK: image_get_lod {{v\[[0-9]+:[0-9]+\]}}, 3, 0, 0, -1, 0, 0, 0, 0, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @getlod() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.getlod.i32(i32 undef, <32 x i8> undef, <16 x i8> undef, i32 15, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r0, float %r1)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}getlod_v2:
+;CHECK: image_get_lod {{v\[[0-9]+:[0-9]+\]}}, 3, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @getlod_v2() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.getlod.v2i32(<2 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 15, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r0, float %r1)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}getlod_v4:
+;CHECK: image_get_lod {{v\[[0-9]+:[0-9]+\]}}, 3, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @getlod_v4() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.getlod.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 15, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r0, float %r1)
+ ret void
+}
+
+
+declare <4 x float> @llvm.SI.getlod.i32(i32, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.getlod.v2i32(<2 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.getlod.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+
+attributes #0 = { "ShaderType"="0" }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.SI.image.ll b/test/CodeGen/AMDGPU/llvm.SI.image.ll
new file mode 100644
index 000000000000..0fac8d799562
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.SI.image.ll
@@ -0,0 +1,50 @@
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
+
+;CHECK-LABEL: {{^}}image_load:
+;CHECK: image_load {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @image_load() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.image.load.v4i32(<4 x i32> undef, <8 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}image_load_mip:
+;CHECK: image_load_mip {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @image_load_mip() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.image.load.mip.v4i32(<4 x i32> undef, <8 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}getresinfo:
+;CHECK: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @getresinfo() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.getresinfo.i32(i32 undef, <8 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+declare <4 x float> @llvm.SI.image.load.v4i32(<4 x i32>, <8 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.load.mip.v4i32(<4 x i32>, <8 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.getresinfo.i32(i32, <8 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+
+attributes #0 = { "ShaderType"="0" }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.SI.image.sample.ll b/test/CodeGen/AMDGPU/llvm.SI.image.sample.ll
new file mode 100644
index 000000000000..4bc638a28063
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.SI.image.sample.ll
@@ -0,0 +1,310 @@
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
+
+;CHECK-LABEL: {{^}}sample:
+;CHECK: s_wqm
+;CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}sample_cl:
+;CHECK: s_wqm
+;CHECK: image_sample_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_cl() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.image.sample.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}sample_d:
+;CHECK-NOT: s_wqm
+;CHECK: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_d() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.image.sample.d.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}sample_d_cl:
+;CHECK-NOT: s_wqm
+;CHECK: image_sample_d_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_d_cl() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.image.sample.d.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}sample_l:
+;CHECK-NOT: s_wqm
+;CHECK: image_sample_l {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_l() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.image.sample.l.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}sample_b:
+;CHECK: s_wqm
+;CHECK: image_sample_b {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_b() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.image.sample.b.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}sample_b_cl:
+;CHECK: s_wqm
+;CHECK: image_sample_b_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_b_cl() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.image.sample.b.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}sample_lz:
+;CHECK-NOT: s_wqm
+;CHECK: image_sample_lz {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_lz() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.image.sample.lz.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}sample_cd:
+;CHECK-NOT: s_wqm
+;CHECK: image_sample_cd {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_cd() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.image.sample.cd.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}sample_cd_cl:
+;CHECK-NOT: s_wqm
+;CHECK: image_sample_cd_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_cd_cl() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.image.sample.cd.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}sample_c:
+;CHECK: s_wqm
+;CHECK: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_c() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}sample_c_cl:
+;CHECK: s_wqm
+;CHECK: image_sample_c_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_c_cl() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.image.sample.c.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}sample_c_d:
+;CHECK-NOT: s_wqm
+;CHECK: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_c_d() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.image.sample.c.d.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}sample_c_d_cl:
+;CHECK-NOT: s_wqm
+;CHECK: image_sample_c_d_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_c_d_cl() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.image.sample.c.d.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}sample_c_l:
+;CHECK-NOT: s_wqm
+;CHECK: image_sample_c_l {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_c_l() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.image.sample.c.l.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}sample_c_b:
+;CHECK: s_wqm
+;CHECK: image_sample_c_b {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_c_b() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.image.sample.c.b.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}sample_c_b_cl:
+;CHECK: s_wqm
+;CHECK: image_sample_c_b_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_c_b_cl() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.image.sample.c.b.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}sample_c_lz:
+;CHECK-NOT: s_wqm
+;CHECK: image_sample_c_lz {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_c_lz() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.image.sample.c.lz.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}sample_c_cd:
+;CHECK-NOT: s_wqm
+;CHECK: image_sample_c_cd {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_c_cd() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.image.sample.c.cd.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}sample_c_cd_cl:
+;CHECK-NOT: s_wqm
+;CHECK: image_sample_c_cd_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_c_cd_cl() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.image.sample.c.cd.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+
+declare <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.d.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.d.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.l.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.b.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.b.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.lz.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.cd.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.cd.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+
+declare <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.c.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.c.d.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.c.d.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.c.l.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.c.b.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.c.b.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.c.lz.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.c.cd.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.c.cd.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+
+attributes #0 = { "ShaderType"="0" }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.SI.image.sample.o.ll b/test/CodeGen/AMDGPU/llvm.SI.image.sample.o.ll
new file mode 100644
index 000000000000..9d8935414ed9
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.SI.image.sample.o.ll
@@ -0,0 +1,310 @@
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
+
+;CHECK-LABEL: {{^}}sample:
+;CHECK: s_wqm
+;CHECK: image_sample_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.image.sample.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}sample_cl:
+;CHECK: s_wqm
+;CHECK: image_sample_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_cl() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.image.sample.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}sample_d:
+;CHECK-NOT: s_wqm
+;CHECK: image_sample_d_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_d() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.image.sample.d.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}sample_d_cl:
+;CHECK-NOT: s_wqm
+;CHECK: image_sample_d_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_d_cl() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.image.sample.d.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}sample_l:
+;CHECK-NOT: s_wqm
+;CHECK: image_sample_l_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_l() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.image.sample.l.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}sample_b:
+;CHECK: s_wqm
+;CHECK: image_sample_b_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_b() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.image.sample.b.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}sample_b_cl:
+;CHECK: s_wqm
+;CHECK: image_sample_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_b_cl() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.image.sample.b.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}sample_lz:
+;CHECK-NOT: s_wqm
+;CHECK: image_sample_lz_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_lz() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.image.sample.lz.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}sample_cd:
+;CHECK-NOT: s_wqm
+;CHECK: image_sample_cd_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_cd() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.image.sample.cd.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}sample_cd_cl:
+;CHECK-NOT: s_wqm
+;CHECK: image_sample_cd_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_cd_cl() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.image.sample.cd.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}sample_c:
+;CHECK: s_wqm
+;CHECK: image_sample_c_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_c() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.image.sample.c.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}sample_c_cl:
+;CHECK: s_wqm
+;CHECK: image_sample_c_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_c_cl() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.image.sample.c.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}sample_c_d:
+;CHECK-NOT: s_wqm
+;CHECK: image_sample_c_d_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_c_d() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.image.sample.c.d.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}sample_c_d_cl:
+;CHECK-NOT: s_wqm
+;CHECK: image_sample_c_d_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_c_d_cl() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.image.sample.c.d.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}sample_c_l:
+;CHECK-NOT: s_wqm
+;CHECK: image_sample_c_l_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_c_l() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.image.sample.c.l.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}sample_c_b:
+;CHECK: s_wqm
+;CHECK: image_sample_c_b_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_c_b() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.image.sample.c.b.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}sample_c_b_cl:
+;CHECK: s_wqm
+;CHECK: image_sample_c_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_c_b_cl() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.image.sample.c.b.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}sample_c_lz:
+;CHECK-NOT: s_wqm
+;CHECK: image_sample_c_lz_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_c_lz() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.image.sample.c.lz.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}sample_c_cd:
+;CHECK-NOT: s_wqm
+;CHECK: image_sample_c_cd_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_c_cd() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.image.sample.c.cd.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}sample_c_cd_cl:
+;CHECK-NOT: s_wqm
+;CHECK: image_sample_c_cd_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_c_cd_cl() #0 {
+main_body:
+ %r = call <4 x float> @llvm.SI.image.sample.c.cd.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %r0 = extractelement <4 x float> %r, i32 0
+ %r1 = extractelement <4 x float> %r, i32 1
+ %r2 = extractelement <4 x float> %r, i32 2
+ %r3 = extractelement <4 x float> %r, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+ ret void
+}
+
+
+declare <4 x float> @llvm.SI.image.sample.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.d.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.d.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.l.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.b.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.b.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.lz.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.cd.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.cd.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+
+declare <4 x float> @llvm.SI.image.sample.c.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.c.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.c.d.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.c.d.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.c.l.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.c.b.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.c.b.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.c.lz.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.c.cd.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.c.cd.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+
+attributes #0 = { "ShaderType"="0" }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.SI.imageload.ll b/test/CodeGen/AMDGPU/llvm.SI.imageload.ll
new file mode 100644
index 000000000000..b67716c3b665
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.SI.imageload.ll
@@ -0,0 +1,132 @@
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
+
+;CHECK-DAG: image_load {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, -1
+;CHECK-DAG: image_load_mip {{v\[[0-9]+:[0-9]+\]}}, 3, 0, 0, 0
+;CHECK-DAG: image_load_mip {{v[0-9]+}}, 2, 0, 0, 0
+;CHECK-DAG: image_load_mip {{v[0-9]+}}, 1, 0, 0, 0
+;CHECK-DAG: image_load_mip {{v[0-9]+}}, 4, 0, 0, 0
+;CHECK-DAG: image_load_mip {{v[0-9]+}}, 8, 0, 0, 0
+;CHECK-DAG: image_load_mip {{v\[[0-9]+:[0-9]+\]}}, 5, 0, 0, 0
+;CHECK-DAG: image_load_mip {{v\[[0-9]+:[0-9]+\]}}, 12, 0, 0, -1
+;CHECK-DAG: image_load_mip {{v\[[0-9]+:[0-9]+\]}}, 7, 0, 0, 0
+;CHECK-DAG: image_load_mip {{v[0-9]+}}, 8, 0, 0, -1
+
+define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4) {
+ %v1 = insertelement <4 x i32> undef, i32 %a1, i32 0
+ %v2 = insertelement <4 x i32> undef, i32 %a1, i32 1
+ %v3 = insertelement <4 x i32> undef, i32 %a1, i32 2
+ %v4 = insertelement <4 x i32> undef, i32 %a1, i32 3
+ %v5 = insertelement <4 x i32> undef, i32 %a2, i32 0
+ %v6 = insertelement <4 x i32> undef, i32 %a2, i32 1
+ %v10 = insertelement <4 x i32> undef, i32 %a3, i32 1
+ %v11 = insertelement <4 x i32> undef, i32 %a3, i32 2
+ %v15 = insertelement <4 x i32> undef, i32 %a4, i32 2
+ %v16 = insertelement <4 x i32> undef, i32 %a4, i32 3
+ %res1 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v1,
+ <32 x i8> undef, i32 1)
+ %res2 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v2,
+ <32 x i8> undef, i32 2)
+ %res3 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v3,
+ <32 x i8> undef, i32 3)
+ %res4 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v4,
+ <32 x i8> undef, i32 4)
+ %res5 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v5,
+ <32 x i8> undef, i32 5)
+ %res6 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v6,
+ <32 x i8> undef, i32 6)
+ %res10 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v10,
+ <32 x i8> undef, i32 10)
+ %res11 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v11,
+ <32 x i8> undef, i32 11)
+ %res15 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v15,
+ <32 x i8> undef, i32 15)
+ %res16 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v16,
+ <32 x i8> undef, i32 16)
+ %e1 = extractelement <4 x i32> %res1, i32 0
+ %e2 = extractelement <4 x i32> %res2, i32 1
+ %e3 = extractelement <4 x i32> %res3, i32 2
+ %e4 = extractelement <4 x i32> %res4, i32 3
+ %t0 = extractelement <4 x i32> %res5, i32 0
+ %t1 = extractelement <4 x i32> %res5, i32 1
+ %e5 = add i32 %t0, %t1
+ %t2 = extractelement <4 x i32> %res6, i32 0
+ %t3 = extractelement <4 x i32> %res6, i32 2
+ %e6 = add i32 %t2, %t3
+ %t10 = extractelement <4 x i32> %res10, i32 2
+ %t11 = extractelement <4 x i32> %res10, i32 3
+ %e10 = add i32 %t10, %t11
+ %t12 = extractelement <4 x i32> %res11, i32 0
+ %t13 = extractelement <4 x i32> %res11, i32 1
+ %t14 = extractelement <4 x i32> %res11, i32 2
+ %t15 = add i32 %t12, %t13
+ %e11 = add i32 %t14, %t15
+ %t28 = extractelement <4 x i32> %res15, i32 0
+ %t29 = extractelement <4 x i32> %res15, i32 1
+ %t30 = extractelement <4 x i32> %res15, i32 2
+ %t31 = extractelement <4 x i32> %res15, i32 3
+ %t32 = add i32 %t28, %t29
+ %t33 = add i32 %t30, %t31
+ %e15 = add i32 %t32, %t33
+ %e16 = extractelement <4 x i32> %res16, i32 3
+ %s1 = add i32 %e1, %e2
+ %s2 = add i32 %s1, %e3
+ %s3 = add i32 %s2, %e4
+ %s4 = add i32 %s3, %e5
+ %s5 = add i32 %s4, %e6
+ %s9 = add i32 %s5, %e10
+ %s10 = add i32 %s9, %e11
+ %s14 = add i32 %s10, %e15
+ %s15 = add i32 %s14, %e16
+ %s16 = bitcast i32 %s15 to float
+ call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %s16, float %s16, float %s16, float %s16)
+ ret void
+}
+
+; Test that coordinates are stored in vgprs and not sgprs
+; CHECK: vgpr_coords
+; CHECK: image_load_mip {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}
+define void @vgpr_coords(float addrspace(2)* addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
+main_body:
+ %20 = getelementptr float addrspace(2)*, float addrspace(2)* addrspace(2)* %0, i32 0
+ %21 = load float addrspace(2)*, float addrspace(2)* addrspace(2)* %20, !tbaa !2
+ %22 = getelementptr float, float addrspace(2)* %21, i32 0
+ %23 = load float, float addrspace(2)* %22, !tbaa !2, !invariant.load !1
+ %24 = getelementptr float, float addrspace(2)* %21, i32 1
+ %25 = load float, float addrspace(2)* %24, !tbaa !2, !invariant.load !1
+ %26 = getelementptr float, float addrspace(2)* %21, i32 4
+ %27 = load float, float addrspace(2)* %26, !tbaa !2, !invariant.load !1
+ %28 = getelementptr <32 x i8>, <32 x i8> addrspace(2)* %2, i32 0
+ %29 = load <32 x i8>, <32 x i8> addrspace(2)* %28, !tbaa !2
+ %30 = bitcast float %27 to i32
+ %31 = bitcast float %23 to i32
+ %32 = bitcast float %25 to i32
+ %33 = insertelement <4 x i32> undef, i32 %31, i32 0
+ %34 = insertelement <4 x i32> %33, i32 %32, i32 1
+ %35 = insertelement <4 x i32> %34, i32 %30, i32 2
+ %36 = insertelement <4 x i32> %35, i32 undef, i32 3
+ %37 = call <4 x i32> @llvm.SI.imageload.v4i32(<4 x i32> %36, <32 x i8> %29, i32 2)
+ %38 = extractelement <4 x i32> %37, i32 0
+ %39 = extractelement <4 x i32> %37, i32 1
+ %40 = extractelement <4 x i32> %37, i32 2
+ %41 = extractelement <4 x i32> %37, i32 3
+ %42 = bitcast i32 %38 to float
+ %43 = bitcast i32 %39 to float
+ %44 = bitcast i32 %40 to float
+ %45 = bitcast i32 %41 to float
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %42, float %43, float %44, float %45)
+ ret void
+}
+
+declare <4 x i32> @llvm.SI.imageload.(<4 x i32>, <32 x i8>, i32) readnone
+; Function Attrs: nounwind readnone
+declare <4 x i32> @llvm.SI.imageload.v4i32(<4 x i32>, <32 x i8>, i32) #1
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+
+attributes #0 = { "ShaderType"="0" }
+attributes #1 = { nounwind readnone }
+
+!0 = !{!"const", null}
+!1 = !{}
+!2 = !{!0, !0, i64 0, i32 1}
diff --git a/test/CodeGen/AMDGPU/llvm.SI.load.dword.ll b/test/CodeGen/AMDGPU/llvm.SI.load.dword.ll
new file mode 100644
index 000000000000..f6c258539d5b
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.SI.load.dword.ll
@@ -0,0 +1,53 @@
+; RUN: llc -march=amdgcn -mcpu=verde -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tonga -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s
+
+; Example of a simple geometry shader loading vertex attributes from the
+; ESGS ring buffer
+
+; FIXME: Out of bounds immediate offset crashes
+
+; CHECK-LABEL: {{^}}main:
+; CHECK: buffer_load_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 glc slc
+; CHECK: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen glc slc
+; CHECK: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 idxen glc slc
+; CHECK: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 idxen offen glc slc
+; CHECK: s_movk_i32 [[K:s[0-9]+]], 0x4d2 ; encoding
+; CHECK: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, [[K]] idxen offen offset:65535 glc slc
+
+define void @main([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <32 x i8>] addrspace(2)* byval %arg2, [2 x <16 x i8>] addrspace(2)* byval %arg3, [17 x <16 x i8>] addrspace(2)* inreg %arg4, [17 x <16 x i8>] addrspace(2)* inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9) #0 {
+main_body:
+ %tmp = getelementptr [2 x <16 x i8>], [2 x <16 x i8>] addrspace(2)* %arg3, i64 0, i32 1
+ %tmp10 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0
+ %tmp11 = shl i32 %arg6, 2
+ %tmp12 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %tmp10, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 0)
+ %tmp13 = bitcast i32 %tmp12 to float
+ %tmp14 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %tmp10, i32 %tmp11, i32 0, i32 0, i32 1, i32 0, i32 1, i32 1, i32 0)
+ %tmp15 = bitcast i32 %tmp14 to float
+ %tmp16 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %tmp10, i32 %tmp11, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 0)
+ %tmp17 = bitcast i32 %tmp16 to float
+ %tmp18 = call i32 @llvm.SI.buffer.load.dword.i32.v2i32(<16 x i8> %tmp10, <2 x i32> zeroinitializer, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 0)
+ %tmp19 = bitcast i32 %tmp18 to float
+
+ %tmp20 = call i32 @llvm.SI.buffer.load.dword.i32.v2i32(<16 x i8> %tmp10, <2 x i32> zeroinitializer, i32 0, i32 123, i32 1, i32 1, i32 1, i32 1, i32 0)
+ %tmp21 = bitcast i32 %tmp20 to float
+
+ %tmp22 = call i32 @llvm.SI.buffer.load.dword.i32.v2i32(<16 x i8> %tmp10, <2 x i32> zeroinitializer, i32 1234, i32 65535, i32 1, i32 1, i32 1, i32 1, i32 0)
+ %tmp23 = bitcast i32 %tmp22 to float
+
+ call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %tmp13, float %tmp15, float %tmp17, float %tmp19)
+ call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %tmp21, float %tmp23, float %tmp23, float %tmp23)
+ ret void
+}
+
+; Function Attrs: nounwind readonly
+declare i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+
+; Function Attrs: nounwind readonly
+declare i32 @llvm.SI.buffer.load.dword.i32.v2i32(<16 x i8>, <2 x i32>, i32, i32, i32, i32, i32, i32, i32) #1
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+
+attributes #0 = { "ShaderType"="1" }
+attributes #1 = { nounwind readonly }
+
+!0 = !{!"const", null, i32 1}
diff --git a/test/CodeGen/AMDGPU/llvm.SI.resinfo.ll b/test/CodeGen/AMDGPU/llvm.SI.resinfo.ll
new file mode 100644
index 000000000000..ac95fd0b83a2
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.SI.resinfo.ll
@@ -0,0 +1,111 @@
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
+
+; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, -1
+; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 3, 0, 0, 0
+; CHECK-DAG: image_get_resinfo {{v[0-9]+}}, 2, 0, 0, 0
+; CHECK-DAG: image_get_resinfo {{v[0-9]+}}, 1, 0, 0, 0
+; CHECK-DAG: image_get_resinfo {{v[0-9]+}}, 4, 0, 0, 0
+; CHECK-DAG: image_get_resinfo {{v[0-9]+}}, 8, 0, 0, 0
+; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 5, 0, 0, 0
+; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 9, 0, 0, 0
+; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 6, 0, 0, 0
+; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 10, 0, 0, -1
+; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 12, 0, 0, -1
+; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 7, 0, 0, 0
+; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 11, 0, 0, 0
+; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 13, 0, 0, 0
+; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 14, 0, 0, 0
+; CHECK-DAG: image_get_resinfo {{v[0-9]+}}, 8, 0, 0, -1
+
+define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8,
+ i32 %a9, i32 %a10, i32 %a11, i32 %a12, i32 %a13, i32 %a14, i32 %a15, i32 %a16) {
+ %res1 = call <4 x i32> @llvm.SI.resinfo(i32 %a1, <32 x i8> undef, i32 1)
+ %res2 = call <4 x i32> @llvm.SI.resinfo(i32 %a2, <32 x i8> undef, i32 2)
+ %res3 = call <4 x i32> @llvm.SI.resinfo(i32 %a3, <32 x i8> undef, i32 3)
+ %res4 = call <4 x i32> @llvm.SI.resinfo(i32 %a4, <32 x i8> undef, i32 4)
+ %res5 = call <4 x i32> @llvm.SI.resinfo(i32 %a5, <32 x i8> undef, i32 5)
+ %res6 = call <4 x i32> @llvm.SI.resinfo(i32 %a6, <32 x i8> undef, i32 6)
+ %res7 = call <4 x i32> @llvm.SI.resinfo(i32 %a7, <32 x i8> undef, i32 7)
+ %res8 = call <4 x i32> @llvm.SI.resinfo(i32 %a8, <32 x i8> undef, i32 8)
+ %res9 = call <4 x i32> @llvm.SI.resinfo(i32 %a9, <32 x i8> undef, i32 9)
+ %res10 = call <4 x i32> @llvm.SI.resinfo(i32 %a10, <32 x i8> undef, i32 10)
+ %res11 = call <4 x i32> @llvm.SI.resinfo(i32 %a11, <32 x i8> undef, i32 11)
+ %res12 = call <4 x i32> @llvm.SI.resinfo(i32 %a12, <32 x i8> undef, i32 12)
+ %res13 = call <4 x i32> @llvm.SI.resinfo(i32 %a13, <32 x i8> undef, i32 13)
+ %res14 = call <4 x i32> @llvm.SI.resinfo(i32 %a14, <32 x i8> undef, i32 14)
+ %res15 = call <4 x i32> @llvm.SI.resinfo(i32 %a15, <32 x i8> undef, i32 15)
+ %res16 = call <4 x i32> @llvm.SI.resinfo(i32 %a16, <32 x i8> undef, i32 16)
+ %e1 = extractelement <4 x i32> %res1, i32 0
+ %e2 = extractelement <4 x i32> %res2, i32 1
+ %e3 = extractelement <4 x i32> %res3, i32 2
+ %e4 = extractelement <4 x i32> %res4, i32 3
+ %t0 = extractelement <4 x i32> %res5, i32 0
+ %t1 = extractelement <4 x i32> %res5, i32 1
+ %e5 = add i32 %t0, %t1
+ %t2 = extractelement <4 x i32> %res6, i32 0
+ %t3 = extractelement <4 x i32> %res6, i32 2
+ %e6 = add i32 %t2, %t3
+ %t4 = extractelement <4 x i32> %res7, i32 0
+ %t5 = extractelement <4 x i32> %res7, i32 3
+ %e7 = add i32 %t4, %t5
+ %t6 = extractelement <4 x i32> %res8, i32 1
+ %t7 = extractelement <4 x i32> %res8, i32 2
+ %e8 = add i32 %t6, %t7
+ %t8 = extractelement <4 x i32> %res9, i32 1
+ %t9 = extractelement <4 x i32> %res9, i32 3
+ %e9 = add i32 %t8, %t9
+ %t10 = extractelement <4 x i32> %res10, i32 2
+ %t11 = extractelement <4 x i32> %res10, i32 3
+ %e10 = add i32 %t10, %t11
+ %t12 = extractelement <4 x i32> %res11, i32 0
+ %t13 = extractelement <4 x i32> %res11, i32 1
+ %t14 = extractelement <4 x i32> %res11, i32 2
+ %t15 = add i32 %t12, %t13
+ %e11 = add i32 %t14, %t15
+ %t16 = extractelement <4 x i32> %res12, i32 0
+ %t17 = extractelement <4 x i32> %res12, i32 1
+ %t18 = extractelement <4 x i32> %res12, i32 3
+ %t19 = add i32 %t16, %t17
+ %e12 = add i32 %t18, %t19
+ %t20 = extractelement <4 x i32> %res13, i32 0
+ %t21 = extractelement <4 x i32> %res13, i32 2
+ %t22 = extractelement <4 x i32> %res13, i32 3
+ %t23 = add i32 %t20, %t21
+ %e13 = add i32 %t22, %t23
+ %t24 = extractelement <4 x i32> %res14, i32 1
+ %t25 = extractelement <4 x i32> %res14, i32 2
+ %t26 = extractelement <4 x i32> %res14, i32 3
+ %t27 = add i32 %t24, %t25
+ %e14 = add i32 %t26, %t27
+ %t28 = extractelement <4 x i32> %res15, i32 0
+ %t29 = extractelement <4 x i32> %res15, i32 1
+ %t30 = extractelement <4 x i32> %res15, i32 2
+ %t31 = extractelement <4 x i32> %res15, i32 3
+ %t32 = add i32 %t28, %t29
+ %t33 = add i32 %t30, %t31
+ %e15 = add i32 %t32, %t33
+ %e16 = extractelement <4 x i32> %res16, i32 3
+ %s1 = add i32 %e1, %e2
+ %s2 = add i32 %s1, %e3
+ %s3 = add i32 %s2, %e4
+ %s4 = add i32 %s3, %e5
+ %s5 = add i32 %s4, %e6
+ %s6 = add i32 %s5, %e7
+ %s7 = add i32 %s6, %e8
+ %s8 = add i32 %s7, %e9
+ %s9 = add i32 %s8, %e10
+ %s10 = add i32 %s9, %e11
+ %s11 = add i32 %s10, %e12
+ %s12 = add i32 %s11, %e13
+ %s13 = add i32 %s12, %e14
+ %s14 = add i32 %s13, %e15
+ %s15 = add i32 %s14, %e16
+ %s16 = bitcast i32 %s15 to float
+ call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %s16, float %s16, float %s16, float %s16)
+ ret void
+}
+
+declare <4 x i32> @llvm.SI.resinfo(i32, <32 x i8>, i32) readnone
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
diff --git a/test/CodeGen/AMDGPU/llvm.SI.sample-masked.ll b/test/CodeGen/AMDGPU/llvm.SI.sample-masked.ll
new file mode 100644
index 000000000000..ce9558cbf81d
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.SI.sample-masked.ll
@@ -0,0 +1,96 @@
+;RUN: llc < %s -march=amdgcn -mcpu=verde | FileCheck %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga | FileCheck %s
+
+; CHECK-LABEL: {{^}}v1:
+; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 13
+define void @v1(i32 %a1) #0 {
+entry:
+ %0 = insertelement <1 x i32> undef, i32 %a1, i32 0
+ %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0)
+ %2 = extractelement <4 x float> %1, i32 0
+ %3 = extractelement <4 x float> %1, i32 2
+ %4 = extractelement <4 x float> %1, i32 3
+ call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %4, float %4)
+ ret void
+}
+
+; CHECK-LABEL: {{^}}v2:
+; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 11
+define void @v2(i32 %a1) #0 {
+entry:
+ %0 = insertelement <1 x i32> undef, i32 %a1, i32 0
+ %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0)
+ %2 = extractelement <4 x float> %1, i32 0
+ %3 = extractelement <4 x float> %1, i32 1
+ %4 = extractelement <4 x float> %1, i32 3
+ call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %4, float %4)
+ ret void
+}
+
+; CHECK-LABEL: {{^}}v3:
+; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 14
+define void @v3(i32 %a1) #0 {
+entry:
+ %0 = insertelement <1 x i32> undef, i32 %a1, i32 0
+ %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0)
+ %2 = extractelement <4 x float> %1, i32 1
+ %3 = extractelement <4 x float> %1, i32 2
+ %4 = extractelement <4 x float> %1, i32 3
+ call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %4, float %4)
+ ret void
+}
+
+; CHECK-LABEL: {{^}}v4:
+; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 7
+define void @v4(i32 %a1) #0 {
+entry:
+ %0 = insertelement <1 x i32> undef, i32 %a1, i32 0
+ %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0)
+ %2 = extractelement <4 x float> %1, i32 0
+ %3 = extractelement <4 x float> %1, i32 1
+ %4 = extractelement <4 x float> %1, i32 2
+ call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %4, float %4)
+ ret void
+}
+
+; CHECK-LABEL: {{^}}v5:
+; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 10
+define void @v5(i32 %a1) #0 {
+entry:
+ %0 = insertelement <1 x i32> undef, i32 %a1, i32 0
+ %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0)
+ %2 = extractelement <4 x float> %1, i32 1
+ %3 = extractelement <4 x float> %1, i32 3
+ call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %3, float %3)
+ ret void
+}
+
+; CHECK-LABEL: {{^}}v6:
+; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 6
+define void @v6(i32 %a1) #0 {
+entry:
+ %0 = insertelement <1 x i32> undef, i32 %a1, i32 0
+ %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0)
+ %2 = extractelement <4 x float> %1, i32 1
+ %3 = extractelement <4 x float> %1, i32 2
+ call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %3, float %3)
+ ret void
+}
+
+; CHECK-LABEL: {{^}}v7:
+; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 9
+define void @v7(i32 %a1) #0 {
+entry:
+ %0 = insertelement <1 x i32> undef, i32 %a1, i32 0
+ %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0)
+ %2 = extractelement <4 x float> %1, i32 0
+ %3 = extractelement <4 x float> %1, i32 3
+ call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %3, float %3)
+ ret void
+}
+
+declare <4 x float> @llvm.SI.sample.v1i32(<1 x i32>, <32 x i8>, <16 x i8>, i32) readnone
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+
+attributes #0 = { "ShaderType"="0" }
diff --git a/test/CodeGen/AMDGPU/llvm.SI.sample.ll b/test/CodeGen/AMDGPU/llvm.SI.sample.ll
new file mode 100644
index 000000000000..509c45f588b8
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.SI.sample.ll
@@ -0,0 +1,160 @@
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
+
+;CHECK-DAG: image_sample {{v\[[0-9]+:[0-9]+\]}}, 15
+;CHECK-DAG: image_sample {{v\[[0-9]+:[0-9]+\]}}, 3
+;CHECK-DAG: image_sample {{v[0-9]+}}, 2
+;CHECK-DAG: image_sample {{v[0-9]+}}, 1
+;CHECK-DAG: image_sample {{v[0-9]+}}, 4
+;CHECK-DAG: image_sample {{v[0-9]+}}, 8
+;CHECK-DAG: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 5
+;CHECK-DAG: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 9
+;CHECK-DAG: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 6
+;CHECK-DAG: image_sample {{v\[[0-9]+:[0-9]+\]}}, 10
+;CHECK-DAG: image_sample {{v\[[0-9]+:[0-9]+\]}}, 12
+;CHECK-DAG: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 7
+;CHECK-DAG: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 11
+;CHECK-DAG: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 13
+;CHECK-DAG: image_sample {{v\[[0-9]+:[0-9]+\]}}, 14
+;CHECK-DAG: image_sample {{v[0-9]+}}, 8
+
+define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4) #0 {
+ %v1 = insertelement <4 x i32> undef, i32 %a1, i32 0
+ %v2 = insertelement <4 x i32> undef, i32 %a1, i32 1
+ %v3 = insertelement <4 x i32> undef, i32 %a1, i32 2
+ %v4 = insertelement <4 x i32> undef, i32 %a1, i32 3
+ %v5 = insertelement <4 x i32> undef, i32 %a2, i32 0
+ %v6 = insertelement <4 x i32> undef, i32 %a2, i32 1
+ %v7 = insertelement <4 x i32> undef, i32 %a2, i32 2
+ %v8 = insertelement <4 x i32> undef, i32 %a2, i32 3
+ %v9 = insertelement <4 x i32> undef, i32 %a3, i32 0
+ %v10 = insertelement <4 x i32> undef, i32 %a3, i32 1
+ %v11 = insertelement <4 x i32> undef, i32 %a3, i32 2
+ %v12 = insertelement <4 x i32> undef, i32 %a3, i32 3
+ %v13 = insertelement <4 x i32> undef, i32 %a4, i32 0
+ %v14 = insertelement <4 x i32> undef, i32 %a4, i32 1
+ %v15 = insertelement <4 x i32> undef, i32 %a4, i32 2
+ %v16 = insertelement <4 x i32> undef, i32 %a4, i32 3
+ %res1 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v1,
+ <32 x i8> undef, <16 x i8> undef, i32 1)
+ %res2 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v2,
+ <32 x i8> undef, <16 x i8> undef, i32 2)
+ %res3 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v3,
+ <32 x i8> undef, <16 x i8> undef, i32 3)
+ %res4 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v4,
+ <32 x i8> undef, <16 x i8> undef, i32 4)
+ %res5 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v5,
+ <32 x i8> undef, <16 x i8> undef, i32 5)
+ %res6 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v6,
+ <32 x i8> undef, <16 x i8> undef, i32 6)
+ %res7 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v7,
+ <32 x i8> undef, <16 x i8> undef, i32 7)
+ %res8 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v8,
+ <32 x i8> undef, <16 x i8> undef, i32 8)
+ %res9 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v9,
+ <32 x i8> undef, <16 x i8> undef, i32 9)
+ %res10 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v10,
+ <32 x i8> undef, <16 x i8> undef, i32 10)
+ %res11 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v11,
+ <32 x i8> undef, <16 x i8> undef, i32 11)
+ %res12 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v12,
+ <32 x i8> undef, <16 x i8> undef, i32 12)
+ %res13 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v13,
+ <32 x i8> undef, <16 x i8> undef, i32 13)
+ %res14 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v14,
+ <32 x i8> undef, <16 x i8> undef, i32 14)
+ %res15 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v15,
+ <32 x i8> undef, <16 x i8> undef, i32 15)
+ %res16 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v16,
+ <32 x i8> undef, <16 x i8> undef, i32 16)
+ %e1 = extractelement <4 x float> %res1, i32 0
+ %e2 = extractelement <4 x float> %res2, i32 1
+ %e3 = extractelement <4 x float> %res3, i32 2
+ %e4 = extractelement <4 x float> %res4, i32 3
+ %t0 = extractelement <4 x float> %res5, i32 0
+ %t1 = extractelement <4 x float> %res5, i32 1
+ %e5 = fadd float %t0, %t1
+ %t2 = extractelement <4 x float> %res6, i32 0
+ %t3 = extractelement <4 x float> %res6, i32 2
+ %e6 = fadd float %t2, %t3
+ %t4 = extractelement <4 x float> %res7, i32 0
+ %t5 = extractelement <4 x float> %res7, i32 3
+ %e7 = fadd float %t4, %t5
+ %t6 = extractelement <4 x float> %res8, i32 1
+ %t7 = extractelement <4 x float> %res8, i32 2
+ %e8 = fadd float %t6, %t7
+ %t8 = extractelement <4 x float> %res9, i32 1
+ %t9 = extractelement <4 x float> %res9, i32 3
+ %e9 = fadd float %t8, %t9
+ %t10 = extractelement <4 x float> %res10, i32 2
+ %t11 = extractelement <4 x float> %res10, i32 3
+ %e10 = fadd float %t10, %t11
+ %t12 = extractelement <4 x float> %res11, i32 0
+ %t13 = extractelement <4 x float> %res11, i32 1
+ %t14 = extractelement <4 x float> %res11, i32 2
+ %t15 = fadd float %t12, %t13
+ %e11 = fadd float %t14, %t15
+ %t16 = extractelement <4 x float> %res12, i32 0
+ %t17 = extractelement <4 x float> %res12, i32 1
+ %t18 = extractelement <4 x float> %res12, i32 3
+ %t19 = fadd float %t16, %t17
+ %e12 = fadd float %t18, %t19
+ %t20 = extractelement <4 x float> %res13, i32 0
+ %t21 = extractelement <4 x float> %res13, i32 2
+ %t22 = extractelement <4 x float> %res13, i32 3
+ %t23 = fadd float %t20, %t21
+ %e13 = fadd float %t22, %t23
+ %t24 = extractelement <4 x float> %res14, i32 1
+ %t25 = extractelement <4 x float> %res14, i32 2
+ %t26 = extractelement <4 x float> %res14, i32 3
+ %t27 = fadd float %t24, %t25
+ %e14 = fadd float %t26, %t27
+ %t28 = extractelement <4 x float> %res15, i32 0
+ %t29 = extractelement <4 x float> %res15, i32 1
+ %t30 = extractelement <4 x float> %res15, i32 2
+ %t31 = extractelement <4 x float> %res15, i32 3
+ %t32 = fadd float %t28, %t29
+ %t33 = fadd float %t30, %t31
+ %e15 = fadd float %t32, %t33
+ %e16 = extractelement <4 x float> %res16, i32 3
+ %s1 = fadd float %e1, %e2
+ %s2 = fadd float %s1, %e3
+ %s3 = fadd float %s2, %e4
+ %s4 = fadd float %s3, %e5
+ %s5 = fadd float %s4, %e6
+ %s6 = fadd float %s5, %e7
+ %s7 = fadd float %s6, %e8
+ %s8 = fadd float %s7, %e9
+ %s9 = fadd float %s8, %e10
+ %s10 = fadd float %s9, %e11
+ %s11 = fadd float %s10, %e12
+ %s12 = fadd float %s11, %e13
+ %s13 = fadd float %s12, %e14
+ %s14 = fadd float %s13, %e15
+ %s15 = fadd float %s14, %e16
+ call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %s15, float %s15, float %s15, float %s15)
+ ret void
+}
+
+; CHECK: {{^}}v1:
+; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 15
+define void @v1(i32 %a1) #0 {
+entry:
+ %0 = insertelement <1 x i32> undef, i32 %a1, i32 0
+ %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0)
+ %2 = extractelement <4 x float> %1, i32 0
+ %3 = extractelement <4 x float> %1, i32 1
+ %4 = extractelement <4 x float> %1, i32 2
+ %5 = extractelement <4 x float> %1, i32 3
+ call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %4, float %5)
+ ret void
+}
+
+
+declare <4 x float> @llvm.SI.sample.v1i32(<1 x i32>, <32 x i8>, <16 x i8>, i32) readnone
+
+declare <4 x float> @llvm.SI.sample.(<4 x i32>, <32 x i8>, <16 x i8>, i32) readnone
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+
+attributes #0 = { "ShaderType"="0" }
diff --git a/test/CodeGen/AMDGPU/llvm.SI.sampled.ll b/test/CodeGen/AMDGPU/llvm.SI.sampled.ll
new file mode 100644
index 000000000000..f2badff2a99c
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.SI.sampled.ll
@@ -0,0 +1,143 @@
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
+
+;CHECK-DAG: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, 15
+;CHECK-DAG: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, 3
+;CHECK-DAG: image_sample_d {{v[0-9]+}}, 2
+;CHECK-DAG: image_sample_d {{v[0-9]+}}, 1
+;CHECK-DAG: image_sample_d {{v[0-9]+}}, 4
+;CHECK-DAG: image_sample_d {{v[0-9]+}}, 8
+;CHECK-DAG: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 5
+;CHECK-DAG: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 9
+;CHECK-DAG: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 6
+;CHECK-DAG: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, 10
+;CHECK-DAG: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, 12
+;CHECK-DAG: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 7
+;CHECK-DAG: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 11
+;CHECK-DAG: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 13
+;CHECK-DAG: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, 14
+;CHECK-DAG: image_sample_d {{v[0-9]+}}, 8
+
+define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4) #0 {
+ %v1 = insertelement <4 x i32> undef, i32 %a1, i32 0
+ %v2 = insertelement <4 x i32> undef, i32 %a1, i32 1
+ %v3 = insertelement <4 x i32> undef, i32 %a1, i32 2
+ %v4 = insertelement <4 x i32> undef, i32 %a1, i32 3
+ %v5 = insertelement <4 x i32> undef, i32 %a2, i32 0
+ %v6 = insertelement <4 x i32> undef, i32 %a2, i32 1
+ %v7 = insertelement <4 x i32> undef, i32 %a2, i32 2
+ %v8 = insertelement <4 x i32> undef, i32 %a2, i32 3
+ %v9 = insertelement <4 x i32> undef, i32 %a3, i32 0
+ %v10 = insertelement <4 x i32> undef, i32 %a3, i32 1
+ %v11 = insertelement <4 x i32> undef, i32 %a3, i32 2
+ %v12 = insertelement <4 x i32> undef, i32 %a3, i32 3
+ %v13 = insertelement <4 x i32> undef, i32 %a4, i32 0
+ %v14 = insertelement <4 x i32> undef, i32 %a4, i32 1
+ %v15 = insertelement <4 x i32> undef, i32 %a4, i32 2
+ %v16 = insertelement <4 x i32> undef, i32 %a4, i32 3
+ %res1 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v1,
+ <32 x i8> undef, <16 x i8> undef, i32 1)
+ %res2 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v2,
+ <32 x i8> undef, <16 x i8> undef, i32 2)
+ %res3 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v3,
+ <32 x i8> undef, <16 x i8> undef, i32 3)
+ %res4 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v4,
+ <32 x i8> undef, <16 x i8> undef, i32 4)
+ %res5 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v5,
+ <32 x i8> undef, <16 x i8> undef, i32 5)
+ %res6 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v6,
+ <32 x i8> undef, <16 x i8> undef, i32 6)
+ %res7 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v7,
+ <32 x i8> undef, <16 x i8> undef, i32 7)
+ %res8 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v8,
+ <32 x i8> undef, <16 x i8> undef, i32 8)
+ %res9 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v9,
+ <32 x i8> undef, <16 x i8> undef, i32 9)
+ %res10 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v10,
+ <32 x i8> undef, <16 x i8> undef, i32 10)
+ %res11 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v11,
+ <32 x i8> undef, <16 x i8> undef, i32 11)
+ %res12 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v12,
+ <32 x i8> undef, <16 x i8> undef, i32 12)
+ %res13 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v13,
+ <32 x i8> undef, <16 x i8> undef, i32 13)
+ %res14 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v14,
+ <32 x i8> undef, <16 x i8> undef, i32 14)
+ %res15 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v15,
+ <32 x i8> undef, <16 x i8> undef, i32 15)
+ %res16 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v16,
+ <32 x i8> undef, <16 x i8> undef, i32 16)
+ %e1 = extractelement <4 x float> %res1, i32 0
+ %e2 = extractelement <4 x float> %res2, i32 1
+ %e3 = extractelement <4 x float> %res3, i32 2
+ %e4 = extractelement <4 x float> %res4, i32 3
+ %t0 = extractelement <4 x float> %res5, i32 0
+ %t1 = extractelement <4 x float> %res5, i32 1
+ %e5 = fadd float %t0, %t1
+ %t2 = extractelement <4 x float> %res6, i32 0
+ %t3 = extractelement <4 x float> %res6, i32 2
+ %e6 = fadd float %t2, %t3
+ %t4 = extractelement <4 x float> %res7, i32 0
+ %t5 = extractelement <4 x float> %res7, i32 3
+ %e7 = fadd float %t4, %t5
+ %t6 = extractelement <4 x float> %res8, i32 1
+ %t7 = extractelement <4 x float> %res8, i32 2
+ %e8 = fadd float %t6, %t7
+ %t8 = extractelement <4 x float> %res9, i32 1
+ %t9 = extractelement <4 x float> %res9, i32 3
+ %e9 = fadd float %t8, %t9
+ %t10 = extractelement <4 x float> %res10, i32 2
+ %t11 = extractelement <4 x float> %res10, i32 3
+ %e10 = fadd float %t10, %t11
+ %t12 = extractelement <4 x float> %res11, i32 0
+ %t13 = extractelement <4 x float> %res11, i32 1
+ %t14 = extractelement <4 x float> %res11, i32 2
+ %t15 = fadd float %t12, %t13
+ %e11 = fadd float %t14, %t15
+ %t16 = extractelement <4 x float> %res12, i32 0
+ %t17 = extractelement <4 x float> %res12, i32 1
+ %t18 = extractelement <4 x float> %res12, i32 3
+ %t19 = fadd float %t16, %t17
+ %e12 = fadd float %t18, %t19
+ %t20 = extractelement <4 x float> %res13, i32 0
+ %t21 = extractelement <4 x float> %res13, i32 2
+ %t22 = extractelement <4 x float> %res13, i32 3
+ %t23 = fadd float %t20, %t21
+ %e13 = fadd float %t22, %t23
+ %t24 = extractelement <4 x float> %res14, i32 1
+ %t25 = extractelement <4 x float> %res14, i32 2
+ %t26 = extractelement <4 x float> %res14, i32 3
+ %t27 = fadd float %t24, %t25
+ %e14 = fadd float %t26, %t27
+ %t28 = extractelement <4 x float> %res15, i32 0
+ %t29 = extractelement <4 x float> %res15, i32 1
+ %t30 = extractelement <4 x float> %res15, i32 2
+ %t31 = extractelement <4 x float> %res15, i32 3
+ %t32 = fadd float %t28, %t29
+ %t33 = fadd float %t30, %t31
+ %e15 = fadd float %t32, %t33
+ %e16 = extractelement <4 x float> %res16, i32 3
+ %s1 = fadd float %e1, %e2
+ %s2 = fadd float %s1, %e3
+ %s3 = fadd float %s2, %e4
+ %s4 = fadd float %s3, %e5
+ %s5 = fadd float %s4, %e6
+ %s6 = fadd float %s5, %e7
+ %s7 = fadd float %s6, %e8
+ %s8 = fadd float %s7, %e9
+ %s9 = fadd float %s8, %e10
+ %s10 = fadd float %s9, %e11
+ %s11 = fadd float %s10, %e12
+ %s12 = fadd float %s11, %e13
+ %s13 = fadd float %s12, %e14
+ %s14 = fadd float %s13, %e15
+ %s15 = fadd float %s14, %e16
+ call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %s15, float %s15, float %s15, float %s15)
+ ret void
+}
+
+declare <4 x float> @llvm.SI.sampled.(<4 x i32>, <32 x i8>, <16 x i8>, i32) readnone
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+
+attributes #0 = { "ShaderType"="0" }
diff --git a/test/CodeGen/AMDGPU/llvm.SI.sendmsg-m0.ll b/test/CodeGen/AMDGPU/llvm.SI.sendmsg-m0.ll
new file mode 100644
index 000000000000..2198590f2dfe
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.SI.sendmsg-m0.ll
@@ -0,0 +1,20 @@
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=BOTH %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=VI --check-prefix=BOTH %s
+
+; BOTH-LABEL: {{^}}main:
+; BOTH: s_mov_b32 m0, s0
+; VI-NEXT: s_nop 0
+; BOTH-NEXT: s_sendmsg Gs_done(nop)
+; BOTH-NEXT: s_endpgm
+
+define void @main(i32 inreg %a) #0 {
+main_body:
+ call void @llvm.SI.sendmsg(i32 3, i32 %a)
+ ret void
+}
+
+; Function Attrs: nounwind
+declare void @llvm.SI.sendmsg(i32, i32) #1
+
+attributes #0 = { "ShaderType"="2" "unsafe-fp-math"="true" }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/llvm.SI.sendmsg.ll b/test/CodeGen/AMDGPU/llvm.SI.sendmsg.ll
new file mode 100644
index 000000000000..09675d503355
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.SI.sendmsg.ll
@@ -0,0 +1,24 @@
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
+
+; CHECK-LABEL: {{^}}main:
+; CHECK: s_mov_b32 m0, 0
+; CHECK-NOT: s_mov_b32 m0
+; CHECK: s_sendmsg Gs(emit stream 0)
+; CHECK: s_sendmsg Gs(cut stream 1)
+; CHECK: s_sendmsg Gs(emit-cut stream 2)
+; CHECK: s_sendmsg Gs_done(nop)
+
+define void @main() {
+main_body:
+ call void @llvm.SI.sendmsg(i32 34, i32 0);
+ call void @llvm.SI.sendmsg(i32 274, i32 0);
+ call void @llvm.SI.sendmsg(i32 562, i32 0);
+ call void @llvm.SI.sendmsg(i32 3, i32 0);
+ ret void
+}
+
+; Function Attrs: nounwind
+declare void @llvm.SI.sendmsg(i32, i32) #0
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/llvm.SI.tbuffer.store.ll b/test/CodeGen/AMDGPU/llvm.SI.tbuffer.store.ll
new file mode 100644
index 000000000000..71f51548a5f8
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.SI.tbuffer.store.ll
@@ -0,0 +1,47 @@
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
+
+;CHECK-LABEL: {{^}}test1:
+;CHECK: tbuffer_store_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, 0x20, -1, 0, -1, 0, 14, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0
+define void @test1(i32 %a1, i32 %vaddr) #0 {
+ %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0
+ call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata,
+ i32 4, i32 %vaddr, i32 0, i32 32, i32 14, i32 4, i32 1, i32 0, i32 1,
+ i32 1, i32 0)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}test2:
+;CHECK: tbuffer_store_format_xyz {{v\[[0-9]+:[0-9]+\]}}, 0x18, -1, 0, -1, 0, 13, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0
+define void @test2(i32 %a1, i32 %vaddr) #0 {
+ %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0
+ call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata,
+ i32 3, i32 %vaddr, i32 0, i32 24, i32 13, i32 4, i32 1, i32 0, i32 1,
+ i32 1, i32 0)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}test3:
+;CHECK: tbuffer_store_format_xy {{v\[[0-9]+:[0-9]+\]}}, 0x10, -1, 0, -1, 0, 11, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0
+define void @test3(i32 %a1, i32 %vaddr) #0 {
+ %vdata = insertelement <2 x i32> undef, i32 %a1, i32 0
+ call void @llvm.SI.tbuffer.store.v2i32(<16 x i8> undef, <2 x i32> %vdata,
+ i32 2, i32 %vaddr, i32 0, i32 16, i32 11, i32 4, i32 1, i32 0, i32 1,
+ i32 1, i32 0)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}test4:
+;CHECK: tbuffer_store_format_x {{v[0-9]+}}, 0x8, -1, 0, -1, 0, 4, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0
+define void @test4(i32 %vdata, i32 %vaddr) #0 {
+ call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 %vdata,
+ i32 1, i32 %vaddr, i32 0, i32 8, i32 4, i32 4, i32 1, i32 0, i32 1,
+ i32 1, i32 0)
+ ret void
+}
+
+declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)
+declare void @llvm.SI.tbuffer.store.v2i32(<16 x i8>, <2 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)
+declare void @llvm.SI.tbuffer.store.v4i32(<16 x i8>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)
+
+attributes #0 = { "ShaderType"="1" }
diff --git a/test/CodeGen/AMDGPU/llvm.SI.tid.ll b/test/CodeGen/AMDGPU/llvm.SI.tid.ll
new file mode 100644
index 000000000000..f6e6d7050ba7
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.SI.tid.ll
@@ -0,0 +1,18 @@
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=GCN %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=VI --check-prefix=GCN %s
+
+;GCN: v_mbcnt_lo_u32_b32_e64
+;SI: v_mbcnt_hi_u32_b32_e32
+;VI: v_mbcnt_hi_u32_b32_e64
+
+define void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg) "ShaderType"="0" {
+main_body:
+ %4 = call i32 @llvm.SI.tid()
+ %5 = bitcast i32 %4 to float
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %5, float %5, float %5, float %5)
+ ret void
+}
+
+declare i32 @llvm.SI.tid() readnone
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
diff --git a/test/CodeGen/AMDGPU/llvm.amdgpu.dp4.ll b/test/CodeGen/AMDGPU/llvm.amdgpu.dp4.ll
new file mode 100644
index 000000000000..036cd2ca82a6
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgpu.dp4.ll
@@ -0,0 +1,11 @@
+; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s
+
+declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) nounwind readnone
+
+define void @test_dp4(float addrspace(1)* %out, <4 x float> addrspace(1)* %a, <4 x float> addrspace(1)* %b) nounwind {
+ %src0 = load <4 x float>, <4 x float> addrspace(1)* %a, align 16
+ %src1 = load <4 x float>, <4 x float> addrspace(1)* %b, align 16
+ %dp4 = call float @llvm.AMDGPU.dp4(<4 x float> %src0, <4 x float> %src1) nounwind readnone
+ store float %dp4, float addrspace(1)* %out, align 4
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/llvm.amdgpu.kilp.ll b/test/CodeGen/AMDGPU/llvm.amdgpu.kilp.ll
new file mode 100644
index 000000000000..42df6db1ccfd
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgpu.kilp.ll
@@ -0,0 +1,21 @@
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+; SI-LABEL: {{^}}kilp_gs_const:
+; SI: s_mov_b64 exec, 0
+define void @kilp_gs_const() #0 {
+main_body:
+ %0 = icmp ule i32 0, 3
+ %1 = select i1 %0, float 1.000000e+00, float -1.000000e+00
+ call void @llvm.AMDGPU.kilp(float %1)
+ %2 = icmp ule i32 3, 0
+ %3 = select i1 %2, float 1.000000e+00, float -1.000000e+00
+ call void @llvm.AMDGPU.kilp(float %3)
+ ret void
+}
+
+declare void @llvm.AMDGPU.kilp(float)
+
+attributes #0 = { "ShaderType"="2" }
+
+!0 = !{!"const", null, i32 1}
diff --git a/test/CodeGen/AMDGPU/llvm.amdgpu.lrp.ll b/test/CodeGen/AMDGPU/llvm.amdgpu.lrp.ll
new file mode 100644
index 000000000000..4e4c2ec7791a
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgpu.lrp.ll
@@ -0,0 +1,13 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+declare float @llvm.AMDGPU.lrp(float, float, float) nounwind readnone
+
+; FUNC-LABEL: {{^}}test_lrp:
+; SI: v_sub_f32
+; SI: v_mad_f32
+define void @test_lrp(float addrspace(1)* %out, float %src0, float %src1, float %src2) nounwind {
+ %mad = call float @llvm.AMDGPU.lrp(float %src0, float %src1, float %src2) nounwind readnone
+ store float %mad, float addrspace(1)* %out, align 4
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/llvm.cos.ll b/test/CodeGen/AMDGPU/llvm.cos.ll
new file mode 100644
index 000000000000..c65df8b3e8da
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.cos.ll
@@ -0,0 +1,41 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s -check-prefix=EG -check-prefix=FUNC
+;RUN: llc < %s -march=amdgcn -mcpu=SI | FileCheck %s -check-prefix=SI -check-prefix=FUNC
+;RUN: llc < %s -march=amdgcn -mcpu=tonga | FileCheck %s -check-prefix=SI -check-prefix=FUNC
+
+;FUNC-LABEL: test
+;EG: MULADD_IEEE *
+;EG: FRACT *
+;EG: ADD *
+;EG: COS * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+;EG-NOT: COS
+;SI: v_cos_f32
+;SI-NOT: v_cos_f32
+
+define void @test(float addrspace(1)* %out, float %x) #1 {
+ %cos = call float @llvm.cos.f32(float %x)
+ store float %cos, float addrspace(1)* %out
+ ret void
+}
+
+;FUNC-LABEL: testv
+;EG: COS * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+;EG: COS * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+;EG: COS * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+;EG: COS * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+;EG-NOT: COS
+;SI: v_cos_f32
+;SI: v_cos_f32
+;SI: v_cos_f32
+;SI: v_cos_f32
+;SI-NOT: v_cos_f32
+
+define void @testv(<4 x float> addrspace(1)* %out, <4 x float> inreg %vx) #1 {
+ %cos = call <4 x float> @llvm.cos.v4f32(<4 x float> %vx)
+ store <4 x float> %cos, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+declare float @llvm.cos.f32(float) readnone
+declare <4 x float> @llvm.cos.v4f32(<4 x float>) readnone
+
+attributes #0 = { "ShaderType"="0" }
diff --git a/test/CodeGen/AMDGPU/llvm.exp2.ll b/test/CodeGen/AMDGPU/llvm.exp2.ll
new file mode 100644
index 000000000000..42698925aae4
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.exp2.ll
@@ -0,0 +1,80 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC
+;RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM --check-prefix=FUNC
+;RUN: llc < %s -march=amdgcn -mcpu=SI | FileCheck %s --check-prefix=SI --check-prefix=FUNC
+;RUN: llc < %s -march=amdgcn -mcpu=tonga | FileCheck %s --check-prefix=SI --check-prefix=FUNC
+
+;FUNC-LABEL: {{^}}test:
+;EG: EXP_IEEE
+;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}}
+;SI: v_exp_f32
+
+define void @test(float addrspace(1)* %out, float %in) {
+entry:
+ %0 = call float @llvm.exp2.f32(float %in)
+ store float %0, float addrspace(1)* %out
+ ret void
+}
+
+;FUNC-LABEL: {{^}}testv2:
+;EG: EXP_IEEE
+;EG: EXP_IEEE
+; FIXME: We should be able to merge these packets together on Cayman so we
+; have a maximum of 4 instructions.
+;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}}
+;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}}
+;SI: v_exp_f32
+;SI: v_exp_f32
+
+define void @testv2(<2 x float> addrspace(1)* %out, <2 x float> %in) {
+entry:
+ %0 = call <2 x float> @llvm.exp2.v2f32(<2 x float> %in)
+ store <2 x float> %0, <2 x float> addrspace(1)* %out
+ ret void
+}
+
+;FUNC-LABEL: {{^}}testv4:
+;EG: EXP_IEEE
+;EG: EXP_IEEE
+;EG: EXP_IEEE
+;EG: EXP_IEEE
+; FIXME: We should be able to merge these packets together on Cayman so we
+; have a maximum of 4 instructions.
+;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}}
+;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}}
+;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}}
+;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}}
+;SI: v_exp_f32
+;SI: v_exp_f32
+;SI: v_exp_f32
+;SI: v_exp_f32
+define void @testv4(<4 x float> addrspace(1)* %out, <4 x float> %in) {
+entry:
+ %0 = call <4 x float> @llvm.exp2.v4f32(<4 x float> %in)
+ store <4 x float> %0, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+declare float @llvm.exp2.f32(float) readnone
+declare <2 x float> @llvm.exp2.v2f32(<2 x float>) readnone
+declare <4 x float> @llvm.exp2.v4f32(<4 x float>) readnone
diff --git a/test/CodeGen/AMDGPU/llvm.log2.ll b/test/CodeGen/AMDGPU/llvm.log2.ll
new file mode 100644
index 000000000000..c75e7850b353
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.log2.ll
@@ -0,0 +1,80 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC
+;RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM --check-prefix=FUNC
+;RUN: llc < %s -march=amdgcn -mcpu=SI | FileCheck %s --check-prefix=SI --check-prefix=FUNC
+;RUN: llc < %s -march=amdgcn -mcpu=tonga | FileCheck %s --check-prefix=SI --check-prefix=FUNC
+
+;FUNC-LABEL: {{^}}test:
+;EG: LOG_IEEE
+;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}}
+;SI: v_log_f32
+
+define void @test(float addrspace(1)* %out, float %in) {
+entry:
+ %0 = call float @llvm.log2.f32(float %in)
+ store float %0, float addrspace(1)* %out
+ ret void
+}
+
+;FUNC-LABEL: {{^}}testv2:
+;EG: LOG_IEEE
+;EG: LOG_IEEE
+; FIXME: We should be able to merge these packets together on Cayman so we
+; have a maximum of 4 instructions.
+;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}}
+;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}}
+;SI: v_log_f32
+;SI: v_log_f32
+
+define void @testv2(<2 x float> addrspace(1)* %out, <2 x float> %in) {
+entry:
+ %0 = call <2 x float> @llvm.log2.v2f32(<2 x float> %in)
+ store <2 x float> %0, <2 x float> addrspace(1)* %out
+ ret void
+}
+
+;FUNC-LABEL: {{^}}testv4:
+;EG: LOG_IEEE
+;EG: LOG_IEEE
+;EG: LOG_IEEE
+;EG: LOG_IEEE
+; FIXME: We should be able to merge these packets together on Cayman so we
+; have a maximum of 4 instructions.
+;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}}
+;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}}
+;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}}
+;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}}
+;SI: v_log_f32
+;SI: v_log_f32
+;SI: v_log_f32
+;SI: v_log_f32
+define void @testv4(<4 x float> addrspace(1)* %out, <4 x float> %in) {
+entry:
+ %0 = call <4 x float> @llvm.log2.v4f32(<4 x float> %in)
+ store <4 x float> %0, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+declare float @llvm.log2.f32(float) readnone
+declare <2 x float> @llvm.log2.v2f32(<2 x float>) readnone
+declare <4 x float> @llvm.log2.v4f32(<4 x float>) readnone
diff --git a/test/CodeGen/AMDGPU/llvm.memcpy.ll b/test/CodeGen/AMDGPU/llvm.memcpy.ll
new file mode 100644
index 000000000000..e491732cf9c5
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.memcpy.ll
@@ -0,0 +1,365 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+declare void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* nocapture, i8 addrspace(3)* nocapture, i32, i32, i1) nounwind
+declare void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* nocapture, i8 addrspace(1)* nocapture, i64, i32, i1) nounwind
+
+
+; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align1:
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+
+; SI: s_endpgm
+define void @test_small_memcpy_i64_lds_to_lds_align1(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind {
+ %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)*
+ %bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)*
+ call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i32 1, i1 false) nounwind
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align2:
+; SI: ds_read_u16
+; SI: ds_read_u16
+; SI: ds_read_u16
+; SI: ds_read_u16
+; SI: ds_read_u16
+; SI: ds_read_u16
+; SI: ds_read_u16
+; SI: ds_read_u16
+
+; SI: ds_read_u16
+; SI: ds_read_u16
+; SI: ds_read_u16
+; SI: ds_read_u16
+; SI: ds_read_u16
+; SI: ds_read_u16
+; SI: ds_read_u16
+; SI: ds_read_u16
+
+; SI: ds_write_b16
+; SI: ds_write_b16
+; SI: ds_write_b16
+; SI: ds_write_b16
+; SI: ds_write_b16
+; SI: ds_write_b16
+; SI: ds_write_b16
+; SI: ds_write_b16
+
+; SI: ds_write_b16
+; SI: ds_write_b16
+; SI: ds_write_b16
+; SI: ds_write_b16
+; SI: ds_write_b16
+; SI: ds_write_b16
+; SI: ds_write_b16
+; SI: ds_write_b16
+
+; SI: s_endpgm
+define void @test_small_memcpy_i64_lds_to_lds_align2(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind {
+ %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)*
+ %bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)*
+ call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i32 2, i1 false) nounwind
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align4:
+; SI-DAG: ds_read_b32
+; SI-DAG: ds_write_b32
+
+; SI-DAG: ds_read_b32
+; SI-DAG: ds_write_b32
+
+; SI-DAG: ds_read_b32
+; SI-DAG: ds_write_b32
+
+; SI-DAG: ds_read_b32
+; SI-DAG: ds_write_b32
+
+; SI-DAG: ds_read_b32
+; SI-DAG: ds_write_b32
+
+; SI-DAG: ds_read_b32
+; SI-DAG: ds_write_b32
+
+; SI-DAG: ds_read_b32
+; SI-DAG: ds_write_b32
+
+; SI-DAG: ds_read_b32
+; SI-DAG: ds_write_b32
+
+; SI-DAG: ds_read_b32
+; SI-DAG: ds_write_b32
+
+; SI: s_endpgm
+define void @test_small_memcpy_i64_lds_to_lds_align4(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind {
+ %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)*
+ %bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)*
+ call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i32 4, i1 false) nounwind
+ ret void
+}
+
+; FIXME: Use 64-bit ops
+; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align8:
+
+; SI-DAG: ds_read_b32
+; SI-DAG: ds_write_b32
+
+; SI-DAG: ds_read_b32
+; SI-DAG: ds_write_b32
+
+; SI-DAG: ds_read_b32
+; SI-DAG: ds_write_b32
+
+; SI-DAG: ds_read_b32
+; SI-DAG: ds_write_b32
+
+; SI-DAG: ds_read_b32
+; SI-DAG: ds_write_b32
+
+; SI-DAG: ds_read_b32
+; SI-DAG: ds_write_b32
+
+; SI-DAG: ds_read_b32
+; SI-DAG: ds_write_b32
+
+; SI-DAG: ds_read_b32
+; SI-DAG: ds_write_b32
+
+; SI-DAG: ds_read_b32
+; SI-DAG: ds_write_b32
+
+; SI-DAG: s_endpgm
+define void @test_small_memcpy_i64_lds_to_lds_align8(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind {
+ %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)*
+ %bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)*
+ call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i32 8, i1 false) nounwind
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_small_memcpy_i64_global_to_global_align1:
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+
+; SI: s_endpgm
+define void @test_small_memcpy_i64_global_to_global_align1(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
+ %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)*
+ %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)*
+ call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 1, i1 false) nounwind
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_small_memcpy_i64_global_to_global_align2:
+; SI-DAG: buffer_load_ushort
+; SI-DAG: buffer_load_ushort
+; SI-DAG: buffer_load_ushort
+; SI-DAG: buffer_load_ushort
+; SI-DAG: buffer_load_ushort
+; SI-DAG: buffer_load_ushort
+; SI-DAG: buffer_load_ushort
+; SI-DAG: buffer_load_ushort
+; SI-DAG: buffer_load_ushort
+; SI-DAG: buffer_load_ushort
+; SI-DAG: buffer_load_ushort
+; SI-DAG: buffer_load_ushort
+; SI-DAG: buffer_load_ushort
+; SI-DAG: buffer_load_ushort
+; SI-DAG: buffer_load_ushort
+; SI-DAG: buffer_load_ushort
+
+; SI-DAG: buffer_store_short
+; SI-DAG: buffer_store_short
+; SI-DAG: buffer_store_short
+; SI-DAG: buffer_store_short
+; SI-DAG: buffer_store_short
+; SI-DAG: buffer_store_short
+; SI-DAG: buffer_store_short
+; SI-DAG: buffer_store_short
+; SI-DAG: buffer_store_short
+; SI-DAG: buffer_store_short
+; SI-DAG: buffer_store_short
+; SI-DAG: buffer_store_short
+; SI-DAG: buffer_store_short
+; SI-DAG: buffer_store_short
+; SI-DAG: buffer_store_short
+; SI-DAG: buffer_store_short
+
+; SI: s_endpgm
+define void @test_small_memcpy_i64_global_to_global_align2(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
+ %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)*
+ %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)*
+ call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 2, i1 false) nounwind
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_small_memcpy_i64_global_to_global_align4:
+; SI: buffer_load_dwordx4
+; SI: buffer_load_dwordx4
+; SI: buffer_store_dwordx4
+; SI: buffer_store_dwordx4
+; SI: s_endpgm
+define void @test_small_memcpy_i64_global_to_global_align4(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
+ %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)*
+ %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)*
+ call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 4, i1 false) nounwind
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_small_memcpy_i64_global_to_global_align8:
+; SI: buffer_load_dwordx4
+; SI: buffer_load_dwordx4
+; SI: buffer_store_dwordx4
+; SI: buffer_store_dwordx4
+; SI: s_endpgm
+define void @test_small_memcpy_i64_global_to_global_align8(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
+ %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)*
+ %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)*
+ call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 8, i1 false) nounwind
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_small_memcpy_i64_global_to_global_align16:
+; SI: buffer_load_dwordx4
+; SI: buffer_load_dwordx4
+; SI: buffer_store_dwordx4
+; SI: buffer_store_dwordx4
+; SI: s_endpgm
+define void @test_small_memcpy_i64_global_to_global_align16(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
+ %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)*
+ %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)*
+ call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 16, i1 false) nounwind
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/llvm.pow.ll b/test/CodeGen/AMDGPU/llvm.pow.ll
new file mode 100644
index 000000000000..c4ae652619c2
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.pow.ll
@@ -0,0 +1,40 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+;CHECK-LABEL: test1:
+;CHECK: LOG_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}},
+;CHECK-NEXT: MUL NON-IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], PS}},
+;CHECK-NEXT: EXP_IEEE * T{{[0-9]+\.[XYZW], PV\.[XYZW]}},
+
+define void @test1(<4 x float> inreg %reg0) #0 {
+ %r0 = extractelement <4 x float> %reg0, i32 0
+ %r1 = extractelement <4 x float> %reg0, i32 1
+ %r2 = call float @llvm.pow.f32( float %r0, float %r1)
+ %vec = insertelement <4 x float> undef, float %r2, i32 0
+ call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
+ ret void
+}
+
+;CHECK-LABEL: test2:
+;CHECK: LOG_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}},
+;CHECK-NEXT: MUL NON-IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], PS}},
+;CHECK-NEXT: LOG_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}},
+;CHECK-NEXT: MUL NON-IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], PS}},
+;CHECK-NEXT: EXP_IEEE * T{{[0-9]+\.[XYZW], PV\.[XYZW]}},
+;CHECK-NEXT: EXP_IEEE * T{{[0-9]+\.[XYZW], PV\.[XYZW]}},
+;CHECK-NEXT: LOG_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}},
+;CHECK-NEXT: MUL NON-IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], PS}},
+;CHECK-NEXT: LOG_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}},
+;CHECK-NEXT: MUL NON-IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], PS}},
+;CHECK-NEXT: EXP_IEEE * T{{[0-9]+\.[XYZW], PV\.[XYZW]}},
+;CHECK-NEXT: EXP_IEEE * T{{[0-9]+\.[XYZW], PV\.[XYZW]}},
+define void @test2(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 {
+ %vec = call <4 x float> @llvm.pow.v4f32( <4 x float> %reg0, <4 x float> %reg1)
+ call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
+ ret void
+}
+
+declare float @llvm.pow.f32(float, float) readonly
+declare <4 x float> @llvm.pow.v4f32(<4 x float>, <4 x float>) readonly
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+
+attributes #0 = { "ShaderType"="0" }
diff --git a/test/CodeGen/AMDGPU/llvm.rint.f64.ll b/test/CodeGen/AMDGPU/llvm.rint.f64.ll
new file mode 100644
index 000000000000..c63fb1727940
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.rint.f64.ll
@@ -0,0 +1,46 @@
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}rint_f64:
+; CI: v_rndne_f64_e32
+
+; SI-DAG: v_add_f64
+; SI-DAG: v_add_f64
+; SI-DAG: v_cmp_gt_f64_e64
+; SI: v_cndmask_b32
+; SI: v_cndmask_b32
+; SI: s_endpgm
+define void @rint_f64(double addrspace(1)* %out, double %in) {
+entry:
+ %0 = call double @llvm.rint.f64(double %in)
+ store double %0, double addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}rint_v2f64:
+; CI: v_rndne_f64_e32
+; CI: v_rndne_f64_e32
+define void @rint_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) {
+entry:
+ %0 = call <2 x double> @llvm.rint.v2f64(<2 x double> %in)
+ store <2 x double> %0, <2 x double> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}rint_v4f64:
+; CI: v_rndne_f64_e32
+; CI: v_rndne_f64_e32
+; CI: v_rndne_f64_e32
+; CI: v_rndne_f64_e32
+define void @rint_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) {
+entry:
+ %0 = call <4 x double> @llvm.rint.v4f64(<4 x double> %in)
+ store <4 x double> %0, <4 x double> addrspace(1)* %out
+ ret void
+}
+
+
+declare double @llvm.rint.f64(double) #0
+declare <2 x double> @llvm.rint.v2f64(<2 x double>) #0
+declare <4 x double> @llvm.rint.v4f64(<4 x double>) #0
diff --git a/test/CodeGen/AMDGPU/llvm.rint.ll b/test/CodeGen/AMDGPU/llvm.rint.ll
new file mode 100644
index 000000000000..661db51ad032
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.rint.ll
@@ -0,0 +1,62 @@
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s -check-prefix=R600 -check-prefix=FUNC
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}rint_f32:
+; R600: RNDNE
+
+; SI: v_rndne_f32_e32
+define void @rint_f32(float addrspace(1)* %out, float %in) {
+entry:
+ %0 = call float @llvm.rint.f32(float %in) #0
+ store float %0, float addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}rint_v2f32:
+; R600: RNDNE
+; R600: RNDNE
+
+; SI: v_rndne_f32_e32
+; SI: v_rndne_f32_e32
+define void @rint_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
+entry:
+ %0 = call <2 x float> @llvm.rint.v2f32(<2 x float> %in) #0
+ store <2 x float> %0, <2 x float> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}rint_v4f32:
+; R600: RNDNE
+; R600: RNDNE
+; R600: RNDNE
+; R600: RNDNE
+
+; SI: v_rndne_f32_e32
+; SI: v_rndne_f32_e32
+; SI: v_rndne_f32_e32
+; SI: v_rndne_f32_e32
+define void @rint_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
+entry:
+ %0 = call <4 x float> @llvm.rint.v4f32(<4 x float> %in) #0
+ store <4 x float> %0, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}legacy_amdil_round_nearest_f32:
+; R600: RNDNE
+
+; SI: v_rndne_f32_e32
+define void @legacy_amdil_round_nearest_f32(float addrspace(1)* %out, float %in) {
+entry:
+ %0 = call float @llvm.AMDIL.round.nearest.f32(float %in) #0
+ store float %0, float addrspace(1)* %out
+ ret void
+}
+
+declare float @llvm.AMDIL.round.nearest.f32(float) #0
+declare float @llvm.rint.f32(float) #0
+declare <2 x float> @llvm.rint.v2f32(<2 x float>) #0
+declare <4 x float> @llvm.rint.v4f32(<4 x float>) #0
+
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.round.f64.ll b/test/CodeGen/AMDGPU/llvm.round.f64.ll
new file mode 100644
index 000000000000..3d0f57e33280
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.round.f64.ll
@@ -0,0 +1,74 @@
+; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}round_f64:
+; SI: s_endpgm
+define void @round_f64(double addrspace(1)* %out, double %x) #0 {
+ %result = call double @llvm.round.f64(double %x) #1
+ store double %result, double addrspace(1)* %out
+ ret void
+}
+
+; This is a pretty large function, so just test a few of the
+; instructions that are necessary.
+
+; FUNC-LABEL: {{^}}v_round_f64:
+; SI: buffer_load_dwordx2
+; SI: v_bfe_u32 [[EXP:v[0-9]+]], v{{[0-9]+}}, 20, 11
+
+; SI-DAG: v_not_b32_e32
+; SI-DAG: v_not_b32_e32
+
+; SI-DAG: v_cmp_eq_i32
+
+; SI-DAG: s_mov_b32 [[BFIMASK:s[0-9]+]], 0x7fffffff
+; SI-DAG: v_cmp_gt_i32_e64
+; SI-DAG: v_bfi_b32 [[COPYSIGN:v[0-9]+]], [[BFIMASK]]
+
+; SI-DAG: v_cmp_gt_i32_e64
+
+
+; SI: buffer_store_dwordx2
+; SI: s_endpgm
+define void @v_round_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %gep = getelementptr double, double addrspace(1)* %in, i32 %tid
+ %out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid
+ %x = load double, double addrspace(1)* %gep
+ %result = call double @llvm.round.f64(double %x) #1
+ store double %result, double addrspace(1)* %out.gep
+ ret void
+}
+
+; FUNC-LABEL: {{^}}round_v2f64:
+; SI: s_endpgm
+define void @round_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) #0 {
+ %result = call <2 x double> @llvm.round.v2f64(<2 x double> %in) #1
+ store <2 x double> %result, <2 x double> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}round_v4f64:
+; SI: s_endpgm
+define void @round_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) #0 {
+ %result = call <4 x double> @llvm.round.v4f64(<4 x double> %in) #1
+ store <4 x double> %result, <4 x double> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}round_v8f64:
+; SI: s_endpgm
+define void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %in) #0 {
+ %result = call <8 x double> @llvm.round.v8f64(<8 x double> %in) #1
+ store <8 x double> %result, <8 x double> addrspace(1)* %out
+ ret void
+}
+
+declare i32 @llvm.r600.read.tidig.x() #1
+
+declare double @llvm.round.f64(double) #1
+declare <2 x double> @llvm.round.v2f64(<2 x double>) #1
+declare <4 x double> @llvm.round.v4f64(<4 x double>) #1
+declare <8 x double> @llvm.round.v8f64(<8 x double>) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.round.ll b/test/CodeGen/AMDGPU/llvm.round.ll
new file mode 100644
index 000000000000..f5f124d915a5
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.round.ll
@@ -0,0 +1,67 @@
+; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}round_f32:
+; SI-DAG: s_load_dword [[SX:s[0-9]+]]
+; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0x7fffffff
+; SI: v_trunc_f32_e32 [[TRUNC:v[0-9]+]], [[SX]]
+; SI: v_sub_f32_e32 [[SUB:v[0-9]+]], [[SX]], [[TRUNC]]
+; SI: v_mov_b32_e32 [[VX:v[0-9]+]], [[SX]]
+; SI: v_bfi_b32 [[COPYSIGN:v[0-9]+]], [[K]], 1.0, [[VX]]
+; SI: v_cmp_le_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], 0.5, |[[SUB]]|
+; SI: v_cndmask_b32_e64 [[SEL:v[0-9]+]], 0, [[VX]], [[CMP]]
+; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], [[SEL]], [[TRUNC]]
+; SI: buffer_store_dword [[RESULT]]
+
+; R600: TRUNC {{.*}}, [[ARG:KC[0-9]\[[0-9]+\]\.[XYZW]]]
+; R600-DAG: ADD {{.*}},
+; R600-DAG: BFI_INT
+; R600-DAG: SETGE
+; R600-DAG: CNDE
+; R600-DAG: ADD
+define void @round_f32(float addrspace(1)* %out, float %x) #0 {
+ %result = call float @llvm.round.f32(float %x) #1
+ store float %result, float addrspace(1)* %out
+ ret void
+}
+
+; The vector tests are really difficult to verify, since it can be hard to
+; predict how the scheduler will order the instructions. We already have
+; a test for the scalar case, so the vector tests just check that the
+; compiler doesn't crash.
+
+; FUNC-LABEL: {{^}}round_v2f32:
+; SI: s_endpgm
+; R600: CF_END
+define void @round_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) #0 {
+ %result = call <2 x float> @llvm.round.v2f32(<2 x float> %in) #1
+ store <2 x float> %result, <2 x float> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}round_v4f32:
+; SI: s_endpgm
+; R600: CF_END
+define void @round_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) #0 {
+ %result = call <4 x float> @llvm.round.v4f32(<4 x float> %in) #1
+ store <4 x float> %result, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}round_v8f32:
+; SI: s_endpgm
+; R600: CF_END
+define void @round_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %in) #0 {
+ %result = call <8 x float> @llvm.round.v8f32(<8 x float> %in) #1
+ store <8 x float> %result, <8 x float> addrspace(1)* %out
+ ret void
+}
+
+declare float @llvm.round.f32(float) #1
+declare <2 x float> @llvm.round.v2f32(<2 x float>) #1
+declare <4 x float> @llvm.round.v4f32(<4 x float>) #1
+declare <8 x float> @llvm.round.v8f32(<8 x float>) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.sin.ll b/test/CodeGen/AMDGPU/llvm.sin.ll
new file mode 100644
index 000000000000..3bb245c2e249
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.sin.ll
@@ -0,0 +1,92 @@
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=SI-SAFE -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI -check-prefix=SI-UNSAFE -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=SI-SAFE -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI -check-prefix=SI-UNSAFE -check-prefix=FUNC %s
+
+; FUNC-LABEL: sin_f32
+; EG: MULADD_IEEE *
+; EG: FRACT *
+; EG: ADD *
+; EG: SIN * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; EG-NOT: SIN
+; SI: v_mul_f32
+; SI: v_fract_f32
+; SI: v_sin_f32
+; SI-NOT: v_sin_f32
+
+define void @sin_f32(float addrspace(1)* %out, float %x) #1 {
+ %sin = call float @llvm.sin.f32(float %x)
+ store float %sin, float addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sin_3x_f32:
+; SI-UNSAFE-NOT: v_add_f32
+; SI-UNSAFE: 0x3ef47644
+; SI-UNSAFE: v_mul_f32
+; SI-SAFE: v_mul_f32
+; SI-SAFE: v_mul_f32
+; SI: v_fract_f32
+; SI: v_sin_f32
+; SI-NOT: v_sin_f32
+define void @sin_3x_f32(float addrspace(1)* %out, float %x) #1 {
+ %y = fmul float 3.0, %x
+ %sin = call float @llvm.sin.f32(float %y)
+ store float %sin, float addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sin_2x_f32:
+; SI-UNSAFE-NOT: v_add_f32
+; SI-UNSAFE: 0x3ea2f983
+; SI-UNSAFE: v_mul_f32
+; SI-SAFE: v_add_f32
+; SI-SAFE: v_mul_f32
+; SI: v_fract_f32
+; SI: v_sin_f32
+; SI-NOT: v_sin_f32
+define void @sin_2x_f32(float addrspace(1)* %out, float %x) #1 {
+ %y = fmul float 2.0, %x
+ %sin = call float @llvm.sin.f32(float %y)
+ store float %sin, float addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_2sin_f32:
+; SI-UNSAFE: 0x3ea2f983
+; SI-UNSAFE: v_mul_f32
+; SI-SAFE: v_add_f32
+; SI-SAFE: v_mul_f32
+; SI: v_fract_f32
+; SI: v_sin_f32
+; SI-NOT: v_sin_f32
+define void @test_2sin_f32(float addrspace(1)* %out, float %x) #1 {
+ %y = fmul float 2.0, %x
+ %sin = call float @llvm.sin.f32(float %y)
+ store float %sin, float addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sin_v4f32:
+; EG: SIN * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; EG: SIN * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; EG: SIN * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; EG: SIN * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; EG-NOT: SIN
+; SI: v_sin_f32
+; SI: v_sin_f32
+; SI: v_sin_f32
+; SI: v_sin_f32
+; SI-NOT: v_sin_f32
+
+define void @sin_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %vx) #1 {
+ %sin = call <4 x float> @llvm.sin.v4f32( <4 x float> %vx)
+ store <4 x float> %sin, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+declare float @llvm.sin.f32(float) readnone
+declare <4 x float> @llvm.sin.v4f32(<4 x float>) readnone
+
+attributes #0 = { "ShaderType"="0" }
diff --git a/test/CodeGen/AMDGPU/llvm.sqrt.ll b/test/CodeGen/AMDGPU/llvm.sqrt.ll
new file mode 100644
index 000000000000..c6da047f5392
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.sqrt.ll
@@ -0,0 +1,105 @@
+; RUN: llc < %s -march=r600 --mcpu=redwood | FileCheck %s --check-prefix=R600
+; RUN: llc < %s -march=amdgcn --mcpu=SI -verify-machineinstrs| FileCheck %s --check-prefix=SI
+; RUN: llc < %s -march=amdgcn --mcpu=tonga -verify-machineinstrs| FileCheck %s --check-prefix=SI
+
+; R600-LABEL: {{^}}sqrt_f32:
+; R600: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[2].Z
+; R600: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[2].Z, PS
+; SI-LABEL: {{^}}sqrt_f32:
+; SI: v_sqrt_f32_e32
+define void @sqrt_f32(float addrspace(1)* %out, float %in) {
+entry:
+ %0 = call float @llvm.sqrt.f32(float %in)
+ store float %0, float addrspace(1)* %out
+ ret void
+}
+
+; R600-LABEL: {{^}}sqrt_v2f32:
+; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[2].W
+; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[2].W, PS
+; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].X
+; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].X, PS
+; SI-LABEL: {{^}}sqrt_v2f32:
+; SI: v_sqrt_f32_e32
+; SI: v_sqrt_f32_e32
+define void @sqrt_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
+entry:
+ %0 = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in)
+ store <2 x float> %0, <2 x float> addrspace(1)* %out
+ ret void
+}
+
+; R600-LABEL: {{^}}sqrt_v4f32:
+; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].Y
+; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].Y, PS
+; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].Z
+; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].Z, PS
+; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].W
+; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].W, PS
+; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[4].X
+; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[4].X, PS
+; SI-LABEL: {{^}}sqrt_v4f32:
+; SI: v_sqrt_f32_e32
+; SI: v_sqrt_f32_e32
+; SI: v_sqrt_f32_e32
+; SI: v_sqrt_f32_e32
+define void @sqrt_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
+entry:
+ %0 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %in)
+ store <4 x float> %0, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+; SI-LABEL: {{^}}elim_redun_check:
+; SI: v_sqrt_f32_e32
+; SI-NOT: v_cndmask
+define void @elim_redun_check(float addrspace(1)* %out, float %in) {
+entry:
+ %sqrt = call float @llvm.sqrt.f32(float %in)
+ %cmp = fcmp olt float %in, -0.000000e+00
+ %res = select i1 %cmp, float 0x7FF8000000000000, float %sqrt
+ store float %res, float addrspace(1)* %out
+ ret void
+}
+
+; SI-LABEL: {{^}}elim_redun_check_ult:
+; SI: v_sqrt_f32_e32
+; SI-NOT: v_cndmask
+define void @elim_redun_check_ult(float addrspace(1)* %out, float %in) {
+entry:
+ %sqrt = call float @llvm.sqrt.f32(float %in)
+ %cmp = fcmp ult float %in, -0.000000e+00
+ %res = select i1 %cmp, float 0x7FF8000000000000, float %sqrt
+ store float %res, float addrspace(1)* %out
+ ret void
+}
+
+; SI-LABEL: {{^}}elim_redun_check_v2:
+; SI: v_sqrt_f32_e32
+; SI: v_sqrt_f32_e32
+; SI-NOT: v_cndmask
+define void @elim_redun_check_v2(<2 x float> addrspace(1)* %out, <2 x float> %in) {
+entry:
+ %sqrt = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in)
+ %cmp = fcmp olt <2 x float> %in, <float -0.000000e+00, float -0.000000e+00>
+ %res = select <2 x i1> %cmp, <2 x float> <float 0x7FF8000000000000, float 0x7FF8000000000000>, <2 x float> %sqrt
+ store <2 x float> %res, <2 x float> addrspace(1)* %out
+ ret void
+}
+
+; SI-LABEL: {{^}}elim_redun_check_v2_ult:
+; SI: v_sqrt_f32_e32
+; SI: v_sqrt_f32_e32
+; SI-NOT: v_cndmask
+define void @elim_redun_check_v2_ult(<2 x float> addrspace(1)* %out, <2 x float> %in) {
+entry:
+ %sqrt = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in)
+ %cmp = fcmp ult <2 x float> %in, <float -0.000000e+00, float -0.000000e+00>
+ %res = select <2 x i1> %cmp, <2 x float> <float 0x7FF8000000000000, float 0x7FF8000000000000>, <2 x float> %sqrt
+ store <2 x float> %res, <2 x float> addrspace(1)* %out
+ ret void
+}
+
+declare float @llvm.sqrt.f32(float %in)
+declare <2 x float> @llvm.sqrt.v2f32(<2 x float> %in)
+declare <4 x float> @llvm.sqrt.v4f32(<4 x float> %in)
diff --git a/test/CodeGen/AMDGPU/load-i1.ll b/test/CodeGen/AMDGPU/load-i1.ll
new file mode 100644
index 000000000000..0ca49fde3e7b
--- /dev/null
+++ b/test/CodeGen/AMDGPU/load-i1.ll
@@ -0,0 +1,149 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}global_copy_i1_to_i1:
+; SI: buffer_load_ubyte
+; SI: v_and_b32_e32 v{{[0-9]+}}, 1
+; SI: buffer_store_byte
+; SI: s_endpgm
+
+; EG: VTX_READ_8
+; EG: AND_INT
+define void @global_copy_i1_to_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+ %load = load i1, i1 addrspace(1)* %in
+ store i1 %load, i1 addrspace(1)* %out, align 1
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_copy_i1_to_i1:
+; SI: ds_read_u8
+; SI: v_and_b32_e32 v{{[0-9]+}}, 1
+; SI: ds_write_b8
+; SI: s_endpgm
+
+; EG: LDS_UBYTE_READ_RET
+; EG: AND_INT
+; EG: LDS_BYTE_WRITE
+define void @local_copy_i1_to_i1(i1 addrspace(3)* %out, i1 addrspace(3)* %in) nounwind {
+ %load = load i1, i1 addrspace(3)* %in
+ store i1 %load, i1 addrspace(3)* %out, align 1
+ ret void
+}
+
+; FUNC-LABEL: {{^}}constant_copy_i1_to_i1:
+; SI: buffer_load_ubyte
+; SI: v_and_b32_e32 v{{[0-9]+}}, 1
+; SI: buffer_store_byte
+; SI: s_endpgm
+
+; EG: VTX_READ_8
+; EG: AND_INT
+define void @constant_copy_i1_to_i1(i1 addrspace(1)* %out, i1 addrspace(2)* %in) nounwind {
+ %load = load i1, i1 addrspace(2)* %in
+ store i1 %load, i1 addrspace(1)* %out, align 1
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_i1_to_i32:
+; SI: buffer_load_ubyte
+; SI: v_bfe_i32
+; SI: buffer_store_dword
+; SI: s_endpgm
+
+; EG: VTX_READ_8
+; EG: BFE_INT
+define void @global_sextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+ %load = load i1, i1 addrspace(1)* %in
+ %ext = sext i1 %load to i32
+ store i32 %ext, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_i1_to_i32:
+; SI: buffer_load_ubyte
+; SI: buffer_store_dword
+; SI: s_endpgm
+
+define void @global_zextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+ %load = load i1, i1 addrspace(1)* %in
+ %ext = zext i1 %load to i32
+ store i32 %ext, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_sextload_i1_to_i64:
+; SI: buffer_load_ubyte
+; SI: v_bfe_i32
+; SI: buffer_store_dwordx2
+; SI: s_endpgm
+define void @global_sextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+ %load = load i1, i1 addrspace(1)* %in
+ %ext = sext i1 %load to i64
+ store i64 %ext, i64 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_zextload_i1_to_i64:
+; SI: buffer_load_ubyte
+; SI: v_mov_b32_e32 {{v[0-9]+}}, 0
+; SI: buffer_store_dwordx2
+; SI: s_endpgm
+define void @global_zextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+ %load = load i1, i1 addrspace(1)* %in
+ %ext = zext i1 %load to i64
+ store i64 %ext, i64 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}i1_arg:
+; SI: buffer_load_ubyte
+; SI: v_and_b32_e32
+; SI: buffer_store_byte
+; SI: s_endpgm
+define void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind {
+ store i1 %x, i1 addrspace(1)* %out, align 1
+ ret void
+}
+
+; FUNC-LABEL: {{^}}i1_arg_zext_i32:
+; SI: buffer_load_ubyte
+; SI: buffer_store_dword
+; SI: s_endpgm
+define void @i1_arg_zext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
+ %ext = zext i1 %x to i32
+ store i32 %ext, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}i1_arg_zext_i64:
+; SI: buffer_load_ubyte
+; SI: buffer_store_dwordx2
+; SI: s_endpgm
+define void @i1_arg_zext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
+ %ext = zext i1 %x to i64
+ store i64 %ext, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}i1_arg_sext_i32:
+; SI: buffer_load_ubyte
+; SI: buffer_store_dword
+; SI: s_endpgm
+define void @i1_arg_sext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
+ %ext = sext i1 %x to i32
+ store i32 %ext, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}i1_arg_sext_i64:
+; SI: buffer_load_ubyte
+; SI: v_bfe_i32
+; SI: v_ashrrev_i32
+; SI: buffer_store_dwordx2
+; SI: s_endpgm
+define void @i1_arg_sext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
+ %ext = sext i1 %x to i64
+ store i64 %ext, i64 addrspace(1)* %out, align 8
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/load-input-fold.ll b/test/CodeGen/AMDGPU/load-input-fold.ll
new file mode 100644
index 000000000000..1daf0e6527b9
--- /dev/null
+++ b/test/CodeGen/AMDGPU/load-input-fold.ll
@@ -0,0 +1,117 @@
+;RUN: llc < %s -march=r600 -mcpu=cayman
+
+define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3) #0 {
+main_body:
+ %0 = extractelement <4 x float> %reg1, i32 0
+ %1 = extractelement <4 x float> %reg1, i32 1
+ %2 = extractelement <4 x float> %reg1, i32 2
+ %3 = extractelement <4 x float> %reg1, i32 3
+ %4 = extractelement <4 x float> %reg2, i32 0
+ %5 = extractelement <4 x float> %reg2, i32 1
+ %6 = extractelement <4 x float> %reg2, i32 2
+ %7 = extractelement <4 x float> %reg2, i32 3
+ %8 = extractelement <4 x float> %reg3, i32 0
+ %9 = extractelement <4 x float> %reg3, i32 1
+ %10 = extractelement <4 x float> %reg3, i32 2
+ %11 = extractelement <4 x float> %reg3, i32 3
+ %12 = load <4 x float>, <4 x float> addrspace(8)* null
+ %13 = extractelement <4 x float> %12, i32 0
+ %14 = fmul float %0, %13
+ %15 = load <4 x float>, <4 x float> addrspace(8)* null
+ %16 = extractelement <4 x float> %15, i32 1
+ %17 = fmul float %0, %16
+ %18 = load <4 x float>, <4 x float> addrspace(8)* null
+ %19 = extractelement <4 x float> %18, i32 2
+ %20 = fmul float %0, %19
+ %21 = load <4 x float>, <4 x float> addrspace(8)* null
+ %22 = extractelement <4 x float> %21, i32 3
+ %23 = fmul float %0, %22
+ %24 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+ %25 = extractelement <4 x float> %24, i32 0
+ %26 = fmul float %1, %25
+ %27 = fadd float %26, %14
+ %28 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+ %29 = extractelement <4 x float> %28, i32 1
+ %30 = fmul float %1, %29
+ %31 = fadd float %30, %17
+ %32 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+ %33 = extractelement <4 x float> %32, i32 2
+ %34 = fmul float %1, %33
+ %35 = fadd float %34, %20
+ %36 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+ %37 = extractelement <4 x float> %36, i32 3
+ %38 = fmul float %1, %37
+ %39 = fadd float %38, %23
+ %40 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+ %41 = extractelement <4 x float> %40, i32 0
+ %42 = fmul float %2, %41
+ %43 = fadd float %42, %27
+ %44 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+ %45 = extractelement <4 x float> %44, i32 1
+ %46 = fmul float %2, %45
+ %47 = fadd float %46, %31
+ %48 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+ %49 = extractelement <4 x float> %48, i32 2
+ %50 = fmul float %2, %49
+ %51 = fadd float %50, %35
+ %52 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+ %53 = extractelement <4 x float> %52, i32 3
+ %54 = fmul float %2, %53
+ %55 = fadd float %54, %39
+ %56 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
+ %57 = extractelement <4 x float> %56, i32 0
+ %58 = fmul float %3, %57
+ %59 = fadd float %58, %43
+ %60 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
+ %61 = extractelement <4 x float> %60, i32 1
+ %62 = fmul float %3, %61
+ %63 = fadd float %62, %47
+ %64 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
+ %65 = extractelement <4 x float> %64, i32 2
+ %66 = fmul float %3, %65
+ %67 = fadd float %66, %51
+ %68 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
+ %69 = extractelement <4 x float> %68, i32 3
+ %70 = fmul float %3, %69
+ %71 = fadd float %70, %55
+ %72 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
+ %73 = extractelement <4 x float> %72, i32 0
+ %74 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
+ %75 = extractelement <4 x float> %74, i32 1
+ %76 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
+ %77 = extractelement <4 x float> %76, i32 2
+ %78 = insertelement <4 x float> undef, float %4, i32 0
+ %79 = insertelement <4 x float> %78, float %5, i32 1
+ %80 = insertelement <4 x float> %79, float %6, i32 2
+ %81 = insertelement <4 x float> %80, float 0.000000e+00, i32 3
+ %82 = insertelement <4 x float> undef, float %73, i32 0
+ %83 = insertelement <4 x float> %82, float %75, i32 1
+ %84 = insertelement <4 x float> %83, float %77, i32 2
+ %85 = insertelement <4 x float> %84, float 0.000000e+00, i32 3
+ %86 = call float @llvm.AMDGPU.dp4(<4 x float> %81, <4 x float> %85)
+ %87 = insertelement <4 x float> undef, float %86, i32 0
+ call void @llvm.R600.store.swizzle(<4 x float> %87, i32 2, i32 2)
+ ret void
+}
+
+; Function Attrs: readnone
+declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1
+
+; Function Attrs: readonly
+declare float @fabs(float) #2
+
+; Function Attrs: readnone
+declare float @llvm.AMDGPU.rsq(float) #1
+
+; Function Attrs: readnone
+declare float @llvm.AMDIL.clamp.(float, float, float) #1
+
+; Function Attrs: nounwind readonly
+declare float @llvm.pow.f32(float, float) #3
+
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+
+attributes #0 = { "ShaderType"="1" }
+attributes #1 = { readnone }
+attributes #2 = { readonly }
+attributes #3 = { nounwind readonly }
diff --git a/test/CodeGen/AMDGPU/load.ll b/test/CodeGen/AMDGPU/load.ll
new file mode 100644
index 000000000000..93b1b51a0d07
--- /dev/null
+++ b/test/CodeGen/AMDGPU/load.ll
@@ -0,0 +1,709 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600 --check-prefix=FUNC %s
+; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck --check-prefix=R600 --check-prefix=FUNC %s
+; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
+
+;===------------------------------------------------------------------------===;
+; GLOBAL ADDRESS SPACE
+;===------------------------------------------------------------------------===;
+
+; Load an i8 value from the global address space.
+; FUNC-LABEL: {{^}}load_i8:
+; R600: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}}
+
+; SI: buffer_load_ubyte v{{[0-9]+}},
+define void @load_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
+ %1 = load i8, i8 addrspace(1)* %in
+ %2 = zext i8 %1 to i32
+ store i32 %2, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}load_i8_sext:
+; R600: VTX_READ_8 [[DST:T[0-9]\.[XYZW]]], [[DST]]
+; R600: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
+; R600: 8
+; SI: buffer_load_sbyte
+define void @load_i8_sext(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
+entry:
+ %0 = load i8, i8 addrspace(1)* %in
+ %1 = sext i8 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}load_v2i8:
+; R600: VTX_READ_8
+; R600: VTX_READ_8
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+define void @load_v2i8(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) {
+entry:
+ %0 = load <2 x i8>, <2 x i8> addrspace(1)* %in
+ %1 = zext <2 x i8> %0 to <2 x i32>
+ store <2 x i32> %1, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}load_v2i8_sext:
+; R600-DAG: VTX_READ_8 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]]
+; R600-DAG: VTX_READ_8 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]]
+; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_X]], 0.0, literal
+; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Y]], 0.0, literal
+; R600-DAG: 8
+; R600-DAG: 8
+
+; SI: buffer_load_sbyte
+; SI: buffer_load_sbyte
+define void @load_v2i8_sext(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) {
+entry:
+ %0 = load <2 x i8>, <2 x i8> addrspace(1)* %in
+ %1 = sext <2 x i8> %0 to <2 x i32>
+ store <2 x i32> %1, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}load_v4i8:
+; R600: VTX_READ_8
+; R600: VTX_READ_8
+; R600: VTX_READ_8
+; R600: VTX_READ_8
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+define void @load_v4i8(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) {
+entry:
+ %0 = load <4 x i8>, <4 x i8> addrspace(1)* %in
+ %1 = zext <4 x i8> %0 to <4 x i32>
+ store <4 x i32> %1, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}load_v4i8_sext:
+; R600-DAG: VTX_READ_8 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]]
+; R600-DAG: VTX_READ_8 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]]
+; R600-DAG: VTX_READ_8 [[DST_Z:T[0-9]\.[XYZW]]], [[DST_Z]]
+; R600-DAG: VTX_READ_8 [[DST_W:T[0-9]\.[XYZW]]], [[DST_W]]
+; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_X]], 0.0, literal
+; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Y]], 0.0, literal
+; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Z]], 0.0, literal
+; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_W]], 0.0, literal
+; R600-DAG: 8
+; R600-DAG: 8
+; R600-DAG: 8
+; R600-DAG: 8
+; SI: buffer_load_sbyte
+; SI: buffer_load_sbyte
+; SI: buffer_load_sbyte
+; SI: buffer_load_sbyte
+define void @load_v4i8_sext(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) {
+entry:
+ %0 = load <4 x i8>, <4 x i8> addrspace(1)* %in
+ %1 = sext <4 x i8> %0 to <4 x i32>
+ store <4 x i32> %1, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+; Load an i16 value from the global address space.
+; FUNC-LABEL: {{^}}load_i16:
+; R600: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}
+; SI: buffer_load_ushort
+define void @load_i16(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
+entry:
+ %0 = load i16, i16 addrspace(1)* %in
+ %1 = zext i16 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}load_i16_sext:
+; R600: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], [[DST]]
+; R600: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
+; R600: 16
+; SI: buffer_load_sshort
+define void @load_i16_sext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
+entry:
+ %0 = load i16, i16 addrspace(1)* %in
+ %1 = sext i16 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}load_v2i16:
+; R600: VTX_READ_16
+; R600: VTX_READ_16
+; SI: buffer_load_ushort
+; SI: buffer_load_ushort
+define void @load_v2i16(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
+entry:
+ %0 = load <2 x i16>, <2 x i16> addrspace(1)* %in
+ %1 = zext <2 x i16> %0 to <2 x i32>
+ store <2 x i32> %1, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}load_v2i16_sext:
+; R600-DAG: VTX_READ_16 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]]
+; R600-DAG: VTX_READ_16 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]]
+; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_X]], 0.0, literal
+; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Y]], 0.0, literal
+; R600-DAG: 16
+; R600-DAG: 16
+; SI: buffer_load_sshort
+; SI: buffer_load_sshort
+define void @load_v2i16_sext(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
+entry:
+ %0 = load <2 x i16>, <2 x i16> addrspace(1)* %in
+ %1 = sext <2 x i16> %0 to <2 x i32>
+ store <2 x i32> %1, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}load_v4i16:
+; R600: VTX_READ_16
+; R600: VTX_READ_16
+; R600: VTX_READ_16
+; R600: VTX_READ_16
+; SI: buffer_load_ushort
+; SI: buffer_load_ushort
+; SI: buffer_load_ushort
+; SI: buffer_load_ushort
+define void @load_v4i16(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
+entry:
+ %0 = load <4 x i16>, <4 x i16> addrspace(1)* %in
+ %1 = zext <4 x i16> %0 to <4 x i32>
+ store <4 x i32> %1, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}load_v4i16_sext:
+; R600-DAG: VTX_READ_16 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]]
+; R600-DAG: VTX_READ_16 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]]
+; R600-DAG: VTX_READ_16 [[DST_Z:T[0-9]\.[XYZW]]], [[DST_Z]]
+; R600-DAG: VTX_READ_16 [[DST_W:T[0-9]\.[XYZW]]], [[DST_W]]
+; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_X]], 0.0, literal
+; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Y]], 0.0, literal
+; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Z]], 0.0, literal
+; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_W]], 0.0, literal
+; R600-DAG: 16
+; R600-DAG: 16
+; R600-DAG: 16
+; R600-DAG: 16
+; SI: buffer_load_sshort
+; SI: buffer_load_sshort
+; SI: buffer_load_sshort
+; SI: buffer_load_sshort
+define void @load_v4i16_sext(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
+entry:
+ %0 = load <4 x i16>, <4 x i16> addrspace(1)* %in
+ %1 = sext <4 x i16> %0 to <4 x i32>
+ store <4 x i32> %1, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+; Load an i32 value from the global address space.
+; FUNC-LABEL: {{^}}load_i32:
+; R600: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
+
+; SI: buffer_load_dword v{{[0-9]+}}
+define void @load_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+entry:
+ %0 = load i32, i32 addrspace(1)* %in
+ store i32 %0, i32 addrspace(1)* %out
+ ret void
+}
+
+; Load an f32 value from the global address space.
+; FUNC-LABEL: {{^}}load_f32:
+; R600: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
+
+; SI: buffer_load_dword v{{[0-9]+}}
+define void @load_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
+entry:
+ %0 = load float, float addrspace(1)* %in
+ store float %0, float addrspace(1)* %out
+ ret void
+}
+
+; Load a v2f32 value from the global address space.
+; FUNC-LABEL: {{^}}load_v2f32:
+; R600: MEM_RAT
+; R600: VTX_READ_64
+; SI: buffer_load_dwordx2
+define void @load_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) {
+entry:
+ %0 = load <2 x float>, <2 x float> addrspace(1)* %in
+ store <2 x float> %0, <2 x float> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}load_i64:
+; R600: VTX_READ_64
+; SI: buffer_load_dwordx2
+define void @load_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+entry:
+ %0 = load i64, i64 addrspace(1)* %in
+ store i64 %0, i64 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}load_i64_sext:
+; R600: MEM_RAT
+; R600: MEM_RAT
+; R600: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}, literal.x
+; R600: 31
+; SI: buffer_load_dword
+
+define void @load_i64_sext(i64 addrspace(1)* %out, i32 addrspace(1)* %in) {
+entry:
+ %0 = load i32, i32 addrspace(1)* %in
+ %1 = sext i32 %0 to i64
+ store i64 %1, i64 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}load_i64_zext:
+; R600: MEM_RAT
+; R600: MEM_RAT
+define void @load_i64_zext(i64 addrspace(1)* %out, i32 addrspace(1)* %in) {
+entry:
+ %0 = load i32, i32 addrspace(1)* %in
+ %1 = zext i32 %0 to i64
+ store i64 %1, i64 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}load_v8i32:
+; R600: VTX_READ_128
+; R600: VTX_READ_128
+; XXX: We should be using DWORDX4 instructions on SI.
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+define void @load_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) {
+entry:
+ %0 = load <8 x i32>, <8 x i32> addrspace(1)* %in
+ store <8 x i32> %0, <8 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}load_v16i32:
+; R600: VTX_READ_128
+; R600: VTX_READ_128
+; R600: VTX_READ_128
+; R600: VTX_READ_128
+; XXX: We should be using DWORDX4 instructions on SI.
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+define void @load_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(1)* %in) {
+entry:
+ %0 = load <16 x i32>, <16 x i32> addrspace(1)* %in
+ store <16 x i32> %0, <16 x i32> addrspace(1)* %out
+ ret void
+}
+
+;===------------------------------------------------------------------------===;
+; CONSTANT ADDRESS SPACE
+;===------------------------------------------------------------------------===;
+
+; Load a sign-extended i8 value
+; FUNC-LABEL: {{^}}load_const_i8_sext:
+; R600: VTX_READ_8 [[DST:T[0-9]\.[XYZW]]], [[DST]]
+; R600: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
+; R600: 8
+; SI: buffer_load_sbyte v{{[0-9]+}},
+define void @load_const_i8_sext(i32 addrspace(1)* %out, i8 addrspace(2)* %in) {
+entry:
+ %0 = load i8, i8 addrspace(2)* %in
+ %1 = sext i8 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; Load an aligned i8 value
+; FUNC-LABEL: {{^}}load_const_i8_aligned:
+; R600: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}}
+; SI: buffer_load_ubyte v{{[0-9]+}},
+define void @load_const_i8_aligned(i32 addrspace(1)* %out, i8 addrspace(2)* %in) {
+entry:
+ %0 = load i8, i8 addrspace(2)* %in
+ %1 = zext i8 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; Load an unaligned i8 value
+; FUNC-LABEL: {{^}}load_const_i8_unaligned:
+; R600: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}}
+; SI: buffer_load_ubyte v{{[0-9]+}},
+define void @load_const_i8_unaligned(i32 addrspace(1)* %out, i8 addrspace(2)* %in) {
+entry:
+ %0 = getelementptr i8, i8 addrspace(2)* %in, i32 1
+ %1 = load i8, i8 addrspace(2)* %0
+ %2 = zext i8 %1 to i32
+ store i32 %2, i32 addrspace(1)* %out
+ ret void
+}
+
+; Load a sign-extended i16 value
+; FUNC-LABEL: {{^}}load_const_i16_sext:
+; R600: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], [[DST]]
+; R600: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
+; R600: 16
+; SI: buffer_load_sshort
+define void @load_const_i16_sext(i32 addrspace(1)* %out, i16 addrspace(2)* %in) {
+entry:
+ %0 = load i16, i16 addrspace(2)* %in
+ %1 = sext i16 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; Load an aligned i16 value
+; FUNC-LABEL: {{^}}load_const_i16_aligned:
+; R600: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}
+; SI: buffer_load_ushort
+define void @load_const_i16_aligned(i32 addrspace(1)* %out, i16 addrspace(2)* %in) {
+entry:
+ %0 = load i16, i16 addrspace(2)* %in
+ %1 = zext i16 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; Load an unaligned i16 value
+; FUNC-LABEL: {{^}}load_const_i16_unaligned:
+; R600: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}
+; SI: buffer_load_ushort
+define void @load_const_i16_unaligned(i32 addrspace(1)* %out, i16 addrspace(2)* %in) {
+entry:
+ %0 = getelementptr i16, i16 addrspace(2)* %in, i32 1
+ %1 = load i16, i16 addrspace(2)* %0
+ %2 = zext i16 %1 to i32
+ store i32 %2, i32 addrspace(1)* %out
+ ret void
+}
+
+; Load an i32 value from the constant address space.
+; FUNC-LABEL: {{^}}load_const_addrspace_i32:
+; R600: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
+
+; SI: s_load_dword s{{[0-9]+}}
+define void @load_const_addrspace_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
+entry:
+ %0 = load i32, i32 addrspace(2)* %in
+ store i32 %0, i32 addrspace(1)* %out
+ ret void
+}
+
+; Load an f32 value from the constant address space.
+; FUNC-LABEL: {{^}}load_const_addrspace_f32:
+; R600: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
+
+; SI: s_load_dword s{{[0-9]+}}
+define void @load_const_addrspace_f32(float addrspace(1)* %out, float addrspace(2)* %in) {
+ %1 = load float, float addrspace(2)* %in
+ store float %1, float addrspace(1)* %out
+ ret void
+}
+
+;===------------------------------------------------------------------------===;
+; LOCAL ADDRESS SPACE
+;===------------------------------------------------------------------------===;
+
+; Load an i8 value from the local address space.
+; FUNC-LABEL: {{^}}load_i8_local:
+; R600: LDS_UBYTE_READ_RET
+; SI-NOT: s_wqm_b64
+; SI: s_mov_b32 m0
+; SI: ds_read_u8
+define void @load_i8_local(i32 addrspace(1)* %out, i8 addrspace(3)* %in) {
+ %1 = load i8, i8 addrspace(3)* %in
+ %2 = zext i8 %1 to i32
+ store i32 %2, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}load_i8_sext_local:
+; R600: LDS_UBYTE_READ_RET
+; R600: BFE_INT
+; SI-NOT: s_wqm_b64
+; SI: s_mov_b32 m0
+; SI: ds_read_i8
+define void @load_i8_sext_local(i32 addrspace(1)* %out, i8 addrspace(3)* %in) {
+entry:
+ %0 = load i8, i8 addrspace(3)* %in
+ %1 = sext i8 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}load_v2i8_local:
+; R600: LDS_UBYTE_READ_RET
+; R600: LDS_UBYTE_READ_RET
+; SI-NOT: s_wqm_b64
+; SI: s_mov_b32 m0
+; SI: ds_read_u8
+; SI: ds_read_u8
+define void @load_v2i8_local(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(3)* %in) {
+entry:
+ %0 = load <2 x i8>, <2 x i8> addrspace(3)* %in
+ %1 = zext <2 x i8> %0 to <2 x i32>
+ store <2 x i32> %1, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}load_v2i8_sext_local:
+; R600-DAG: LDS_UBYTE_READ_RET
+; R600-DAG: LDS_UBYTE_READ_RET
+; R600-DAG: BFE_INT
+; R600-DAG: BFE_INT
+; SI-NOT: s_wqm_b64
+; SI: s_mov_b32 m0
+; SI: ds_read_i8
+; SI: ds_read_i8
+define void @load_v2i8_sext_local(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(3)* %in) {
+entry:
+ %0 = load <2 x i8>, <2 x i8> addrspace(3)* %in
+ %1 = sext <2 x i8> %0 to <2 x i32>
+ store <2 x i32> %1, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}load_v4i8_local:
+; R600: LDS_UBYTE_READ_RET
+; R600: LDS_UBYTE_READ_RET
+; R600: LDS_UBYTE_READ_RET
+; R600: LDS_UBYTE_READ_RET
+; SI-NOT: s_wqm_b64
+; SI: s_mov_b32 m0
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+define void @load_v4i8_local(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(3)* %in) {
+entry:
+ %0 = load <4 x i8>, <4 x i8> addrspace(3)* %in
+ %1 = zext <4 x i8> %0 to <4 x i32>
+ store <4 x i32> %1, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}load_v4i8_sext_local:
+; R600-DAG: LDS_UBYTE_READ_RET
+; R600-DAG: LDS_UBYTE_READ_RET
+; R600-DAG: LDS_UBYTE_READ_RET
+; R600-DAG: LDS_UBYTE_READ_RET
+; R600-DAG: BFE_INT
+; R600-DAG: BFE_INT
+; R600-DAG: BFE_INT
+; R600-DAG: BFE_INT
+; SI-NOT: s_wqm_b64
+; SI: s_mov_b32 m0
+; SI: ds_read_i8
+; SI: ds_read_i8
+; SI: ds_read_i8
+; SI: ds_read_i8
+define void @load_v4i8_sext_local(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(3)* %in) {
+entry:
+ %0 = load <4 x i8>, <4 x i8> addrspace(3)* %in
+ %1 = sext <4 x i8> %0 to <4 x i32>
+ store <4 x i32> %1, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+; Load an i16 value from the local address space.
+; FUNC-LABEL: {{^}}load_i16_local:
+; R600: LDS_USHORT_READ_RET
+; SI-NOT: s_wqm_b64
+; SI: s_mov_b32 m0
+; SI: ds_read_u16
+define void @load_i16_local(i32 addrspace(1)* %out, i16 addrspace(3)* %in) {
+entry:
+ %0 = load i16, i16 addrspace(3)* %in
+ %1 = zext i16 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}load_i16_sext_local:
+; R600: LDS_USHORT_READ_RET
+; R600: BFE_INT
+; SI-NOT: s_wqm_b64
+; SI: s_mov_b32 m0
+; SI: ds_read_i16
+define void @load_i16_sext_local(i32 addrspace(1)* %out, i16 addrspace(3)* %in) {
+entry:
+ %0 = load i16, i16 addrspace(3)* %in
+ %1 = sext i16 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}load_v2i16_local:
+; R600: LDS_USHORT_READ_RET
+; R600: LDS_USHORT_READ_RET
+; SI-NOT: s_wqm_b64
+; SI: s_mov_b32 m0
+; SI: ds_read_u16
+; SI: ds_read_u16
+define void @load_v2i16_local(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(3)* %in) {
+entry:
+ %0 = load <2 x i16>, <2 x i16> addrspace(3)* %in
+ %1 = zext <2 x i16> %0 to <2 x i32>
+ store <2 x i32> %1, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}load_v2i16_sext_local:
+; R600-DAG: LDS_USHORT_READ_RET
+; R600-DAG: LDS_USHORT_READ_RET
+; R600-DAG: BFE_INT
+; R600-DAG: BFE_INT
+; SI-NOT: s_wqm_b64
+; SI: s_mov_b32 m0
+; SI: ds_read_i16
+; SI: ds_read_i16
+define void @load_v2i16_sext_local(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(3)* %in) {
+entry:
+ %0 = load <2 x i16>, <2 x i16> addrspace(3)* %in
+ %1 = sext <2 x i16> %0 to <2 x i32>
+ store <2 x i32> %1, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}load_v4i16_local:
+; R600: LDS_USHORT_READ_RET
+; R600: LDS_USHORT_READ_RET
+; R600: LDS_USHORT_READ_RET
+; R600: LDS_USHORT_READ_RET
+; SI-NOT: s_wqm_b64
+; SI: s_mov_b32 m0
+; SI: ds_read_u16
+; SI: ds_read_u16
+; SI: ds_read_u16
+; SI: ds_read_u16
+define void @load_v4i16_local(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(3)* %in) {
+entry:
+ %0 = load <4 x i16>, <4 x i16> addrspace(3)* %in
+ %1 = zext <4 x i16> %0 to <4 x i32>
+ store <4 x i32> %1, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}load_v4i16_sext_local:
+; R600-DAG: LDS_USHORT_READ_RET
+; R600-DAG: LDS_USHORT_READ_RET
+; R600-DAG: LDS_USHORT_READ_RET
+; R600-DAG: LDS_USHORT_READ_RET
+; R600-DAG: BFE_INT
+; R600-DAG: BFE_INT
+; R600-DAG: BFE_INT
+; R600-DAG: BFE_INT
+; SI-NOT: s_wqm_b64
+; SI: s_mov_b32 m0
+; SI: ds_read_i16
+; SI: ds_read_i16
+; SI: ds_read_i16
+; SI: ds_read_i16
+define void @load_v4i16_sext_local(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(3)* %in) {
+entry:
+ %0 = load <4 x i16>, <4 x i16> addrspace(3)* %in
+ %1 = sext <4 x i16> %0 to <4 x i32>
+ store <4 x i32> %1, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+; Load an i32 value from the local address space.
+; FUNC-LABEL: {{^}}load_i32_local:
+; R600: LDS_READ_RET
+; SI-NOT: s_wqm_b64
+; SI: s_mov_b32 m0
+; SI: ds_read_b32
+define void @load_i32_local(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
+entry:
+ %0 = load i32, i32 addrspace(3)* %in
+ store i32 %0, i32 addrspace(1)* %out
+ ret void
+}
+
+; Load an f32 value from the local address space.
+; FUNC-LABEL: {{^}}load_f32_local:
+; R600: LDS_READ_RET
+; SI: s_mov_b32 m0
+; SI: ds_read_b32
+define void @load_f32_local(float addrspace(1)* %out, float addrspace(3)* %in) {
+entry:
+ %0 = load float, float addrspace(3)* %in
+ store float %0, float addrspace(1)* %out
+ ret void
+}
+
+; Load a v2f32 value from the local address space.
+; FUNC-LABEL: {{^}}load_v2f32_local:
+; R600: LDS_READ_RET
+; R600: LDS_READ_RET
+; SI: s_mov_b32 m0
+; SI: ds_read_b64
+define void @load_v2f32_local(<2 x float> addrspace(1)* %out, <2 x float> addrspace(3)* %in) {
+entry:
+ %0 = load <2 x float>, <2 x float> addrspace(3)* %in
+ store <2 x float> %0, <2 x float> addrspace(1)* %out
+ ret void
+}
+
+; Test loading an i32 and a v2i32 value from the same base pointer.
+; FUNC-LABEL: {{^}}load_i32_v2i32_local:
+; R600: LDS_READ_RET
+; R600: LDS_READ_RET
+; R600: LDS_READ_RET
+; SI-DAG: ds_read_b32
+; SI-DAG: ds_read2_b32
+define void @load_i32_v2i32_local(<2 x i32> addrspace(1)* %out, i32 addrspace(3)* %in) {
+ %scalar = load i32, i32 addrspace(3)* %in
+ %tmp0 = bitcast i32 addrspace(3)* %in to <2 x i32> addrspace(3)*
+ %vec_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(3)* %tmp0, i32 2
+ %vec0 = load <2 x i32>, <2 x i32> addrspace(3)* %vec_ptr, align 4
+ %vec1 = insertelement <2 x i32> <i32 0, i32 0>, i32 %scalar, i32 0
+ %vec = add <2 x i32> %vec0, %vec1
+ store <2 x i32> %vec, <2 x i32> addrspace(1)* %out
+ ret void
+}
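+
+; A minimal illustrative sketch, not one of the original tests (the function
+; name is hypothetical and no CHECK lines reference it): two i32 loads at
+; adjacent offsets from a single LDS base are the simplest case that is
+; expected to merge into one ds_read2_b32 on SI.
+define void @load_adjacent_i32_local(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
+  %ptr1 = getelementptr i32, i32 addrspace(3)* %in, i32 1
+  %a = load i32, i32 addrspace(3)* %in
+  %b = load i32, i32 addrspace(3)* %ptr1
+  %sum = add i32 %a, %b
+  store i32 %sum, i32 addrspace(1)* %out
+  ret void
+}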
+
+
+@lds = addrspace(3) global [512 x i32] undef, align 4
+
+; On SI we need to make sure that the base offset is a register and not
+; an immediate.
+; FUNC-LABEL: {{^}}load_i32_local_const_ptr:
+; SI: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0
+; SI: ds_read_b32 v0, v[[ZERO]] offset:4
+; R600: LDS_READ_RET
+define void @load_i32_local_const_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
+entry:
+ %tmp0 = getelementptr [512 x i32], [512 x i32] addrspace(3)* @lds, i32 0, i32 1
+ %tmp1 = load i32, i32 addrspace(3)* %tmp0
+ %tmp2 = getelementptr i32, i32 addrspace(1)* %out, i32 1
+ store i32 %tmp1, i32 addrspace(1)* %tmp2
+ ret void
+}
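+
+; A minimal illustrative variant, not one of the original tests (the function
+; name is hypothetical and it has no CHECK lines): indexing a different
+; constant element of @lds should lower the same way, with the base kept in a
+; VGPR and the element offset (here 2 * 4 = 8 bytes) folded into the
+; ds_read_b32 offset field.
+define void @load_i32_local_const_ptr_elt2(i32 addrspace(1)* %out) {
+entry:
+  %tmp0 = getelementptr [512 x i32], [512 x i32] addrspace(3)* @lds, i32 0, i32 2
+  %tmp1 = load i32, i32 addrspace(3)* %tmp0
+  store i32 %tmp1, i32 addrspace(1)* %out
+  ret void
+}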
diff --git a/test/CodeGen/AMDGPU/load.vec.ll b/test/CodeGen/AMDGPU/load.vec.ll
new file mode 100644
index 000000000000..02f883cd8e9c
--- /dev/null
+++ b/test/CodeGen/AMDGPU/load.vec.ll
@@ -0,0 +1,25 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s
+; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI %s
+
+; Load a v2i32 value from the global address space.
+; EG: {{^}}load_v2i32:
+; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0
+; SI: {{^}}load_v2i32:
+; SI: buffer_load_dwordx2 v[{{[0-9]+:[0-9]+}}]
+define void @load_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+ %a = load <2 x i32>, <2 x i32> addrspace(1)* %in
+ store <2 x i32> %a, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; Load a v4i32 value from the global address space.
+; EG: {{^}}load_v4i32:
+; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0
+; SI: {{^}}load_v4i32:
+; SI: buffer_load_dwordx4 v[{{[0-9]+:[0-9]+}}]
+define void @load_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+ %a = load <4 x i32>, <4 x i32> addrspace(1)* %in
+ store <4 x i32> %a, <4 x i32> addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/load64.ll b/test/CodeGen/AMDGPU/load64.ll
new file mode 100644
index 000000000000..74beabdc0076
--- /dev/null
+++ b/test/CodeGen/AMDGPU/load64.ll
@@ -0,0 +1,31 @@
+; RUN: llc < %s -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
+
+; Load an f64 value from the global address space.
+; CHECK-LABEL: {{^}}load_f64:
+; CHECK: buffer_load_dwordx2 v[{{[0-9]+:[0-9]+}}]
+; CHECK: buffer_store_dwordx2 v[{{[0-9]+:[0-9]+}}]
+define void @load_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
+ %1 = load double, double addrspace(1)* %in
+ store double %1, double addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}load_i64:
+; CHECK: buffer_load_dwordx2 v[{{[0-9]+:[0-9]+}}]
+; CHECK: buffer_store_dwordx2 v[{{[0-9]+:[0-9]+}}]
+define void @load_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+ %tmp = load i64, i64 addrspace(1)* %in
+ store i64 %tmp, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; Load an f64 value from the constant address space.
+; CHECK-LABEL: {{^}}load_const_addrspace_f64:
+; CHECK: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}]
+; CHECK: buffer_store_dwordx2 v[{{[0-9]+:[0-9]+}}]
+define void @load_const_addrspace_f64(double addrspace(1)* %out, double addrspace(2)* %in) {
+ %1 = load double, double addrspace(2)* %in
+ store double %1, double addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/local-64.ll b/test/CodeGen/AMDGPU/local-64.ll
new file mode 100644
index 000000000000..33f3159d13eb
--- /dev/null
+++ b/test/CodeGen/AMDGPU/local-64.ll
@@ -0,0 +1,167 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=BOTH %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck --check-prefix=CI --check-prefix=BOTH %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=CI --check-prefix=BOTH %s
+
+; BOTH-LABEL: {{^}}local_i32_load
+; BOTH: ds_read_b32 [[REG:v[0-9]+]], v{{[0-9]+}} offset:28
+; BOTH: buffer_store_dword [[REG]],
+define void @local_i32_load(i32 addrspace(1)* %out, i32 addrspace(3)* %in) nounwind {
+ %gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
+ %val = load i32, i32 addrspace(3)* %gep, align 4
+ store i32 %val, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; BOTH-LABEL: {{^}}local_i32_load_0_offset
+; BOTH: ds_read_b32 [[REG:v[0-9]+]], v{{[0-9]+}}
+; BOTH: buffer_store_dword [[REG]],
+define void @local_i32_load_0_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) nounwind {
+ %val = load i32, i32 addrspace(3)* %in, align 4
+ store i32 %val, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; BOTH-LABEL: {{^}}local_i8_load_i16_max_offset:
+; BOTH-NOT: ADD
+; BOTH: ds_read_u8 [[REG:v[0-9]+]], {{v[0-9]+}} offset:65535
+; BOTH: buffer_store_byte [[REG]],
+define void @local_i8_load_i16_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %in) nounwind {
+ %gep = getelementptr i8, i8 addrspace(3)* %in, i32 65535
+ %val = load i8, i8 addrspace(3)* %gep, align 4
+ store i8 %val, i8 addrspace(1)* %out, align 4
+ ret void
+}
+
+; BOTH-LABEL: {{^}}local_i8_load_over_i16_max_offset:
+; The LDS offset will be 65536 bytes, which is larger than the size of LDS on
+; SI, which is why it is being OR'd with the base pointer.
+; SI: s_or_b32 [[ADDR:s[0-9]+]], s{{[0-9]+}}, 0x10000
+; CI: s_add_i32 [[ADDR:s[0-9]+]], s{{[0-9]+}}, 0x10000
+; BOTH: v_mov_b32_e32 [[VREGADDR:v[0-9]+]], [[ADDR]]
+; BOTH: ds_read_u8 [[REG:v[0-9]+]], [[VREGADDR]]
+; BOTH: buffer_store_byte [[REG]],
+define void @local_i8_load_over_i16_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %in) nounwind {
+ %gep = getelementptr i8, i8 addrspace(3)* %in, i32 65536
+ %val = load i8, i8 addrspace(3)* %gep, align 4
+ store i8 %val, i8 addrspace(1)* %out, align 4
+ ret void
+}
+
+; BOTH-LABEL: {{^}}local_i64_load:
+; BOTH-NOT: ADD
+; BOTH: ds_read_b64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset:56
+; BOTH: buffer_store_dwordx2 [[REG]],
+define void @local_i64_load(i64 addrspace(1)* %out, i64 addrspace(3)* %in) nounwind {
+ %gep = getelementptr i64, i64 addrspace(3)* %in, i32 7
+ %val = load i64, i64 addrspace(3)* %gep, align 8
+ store i64 %val, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; BOTH-LABEL: {{^}}local_i64_load_0_offset
+; BOTH: ds_read_b64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}
+; BOTH: buffer_store_dwordx2 [[REG]],
+define void @local_i64_load_0_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %in) nounwind {
+ %val = load i64, i64 addrspace(3)* %in, align 8
+ store i64 %val, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; BOTH-LABEL: {{^}}local_f64_load:
+; BOTH-NOT: ADD
+; BOTH: ds_read_b64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset:56
+; BOTH: buffer_store_dwordx2 [[REG]],
+define void @local_f64_load(double addrspace(1)* %out, double addrspace(3)* %in) nounwind {
+ %gep = getelementptr double, double addrspace(3)* %in, i32 7
+ %val = load double, double addrspace(3)* %gep, align 8
+ store double %val, double addrspace(1)* %out, align 8
+ ret void
+}
+
+; BOTH-LABEL: {{^}}local_f64_load_0_offset
+; BOTH: ds_read_b64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}
+; BOTH: buffer_store_dwordx2 [[REG]],
+define void @local_f64_load_0_offset(double addrspace(1)* %out, double addrspace(3)* %in) nounwind {
+ %val = load double, double addrspace(3)* %in, align 8
+ store double %val, double addrspace(1)* %out, align 8
+ ret void
+}
+
+; BOTH-LABEL: {{^}}local_i64_store:
+; BOTH-NOT: ADD
+; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:56
+define void @local_i64_store(i64 addrspace(3)* %out) nounwind {
+ %gep = getelementptr i64, i64 addrspace(3)* %out, i32 7
+ store i64 5678, i64 addrspace(3)* %gep, align 8
+ ret void
+}
+
+; BOTH-LABEL: {{^}}local_i64_store_0_offset:
+; BOTH-NOT: ADD
+; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}
+define void @local_i64_store_0_offset(i64 addrspace(3)* %out) nounwind {
+ store i64 1234, i64 addrspace(3)* %out, align 8
+ ret void
+}
+
+; BOTH-LABEL: {{^}}local_f64_store:
+; BOTH-NOT: ADD
+; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:56
+define void @local_f64_store(double addrspace(3)* %out) nounwind {
+ %gep = getelementptr double, double addrspace(3)* %out, i32 7
+ store double 16.0, double addrspace(3)* %gep, align 8
+ ret void
+}
+
+; BOTH-LABEL: {{^}}local_f64_store_0_offset
+; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}
+define void @local_f64_store_0_offset(double addrspace(3)* %out) nounwind {
+ store double 20.0, double addrspace(3)* %out, align 8
+ ret void
+}
+
+; BOTH-LABEL: {{^}}local_v2i64_store:
+; BOTH-NOT: ADD
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:112
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:120
+; BOTH: s_endpgm
+define void @local_v2i64_store(<2 x i64> addrspace(3)* %out) nounwind {
+ %gep = getelementptr <2 x i64>, <2 x i64> addrspace(3)* %out, i32 7
+ store <2 x i64> <i64 5678, i64 5678>, <2 x i64> addrspace(3)* %gep, align 16
+ ret void
+}
+
+; BOTH-LABEL: {{^}}local_v2i64_store_0_offset:
+; BOTH-NOT: ADD
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:8
+; BOTH: s_endpgm
+define void @local_v2i64_store_0_offset(<2 x i64> addrspace(3)* %out) nounwind {
+ store <2 x i64> <i64 1234, i64 1234>, <2 x i64> addrspace(3)* %out, align 16
+ ret void
+}
+
+; BOTH-LABEL: {{^}}local_v4i64_store:
+; BOTH-NOT: ADD
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:224
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:232
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:240
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:248
+; BOTH: s_endpgm
+define void @local_v4i64_store(<4 x i64> addrspace(3)* %out) nounwind {
+ %gep = getelementptr <4 x i64>, <4 x i64> addrspace(3)* %out, i32 7
+ store <4 x i64> <i64 5678, i64 5678, i64 5678, i64 5678>, <4 x i64> addrspace(3)* %gep, align 16
+ ret void
+}
+
+; BOTH-LABEL: {{^}}local_v4i64_store_0_offset:
+; BOTH-NOT: ADD
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:8
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:16
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:24
+; BOTH: s_endpgm
+define void @local_v4i64_store_0_offset(<4 x i64> addrspace(3)* %out) nounwind {
+ store <4 x i64> <i64 1234, i64 1234, i64 1234, i64 1234>, <4 x i64> addrspace(3)* %out, align 16
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/local-atomics.ll b/test/CodeGen/AMDGPU/local-atomics.ll
new file mode 100644
index 000000000000..2aaf977ab903
--- /dev/null
+++ b/test/CodeGen/AMDGPU/local-atomics.ll
@@ -0,0 +1,551 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CIVI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=CIVI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}lds_atomic_xchg_ret_i32:
+; EG: LDS_WRXCHG_RET *
+; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
+; GCN: s_load_dword [[SPTR:s[0-9]+]],
+; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
+; GCN: ds_wrxchg_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]]
+; GCN: buffer_store_dword [[RESULT]],
+; GCN: s_endpgm
+define void @lds_atomic_xchg_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw xchg i32 addrspace(3)* %ptr, i32 4 seq_cst
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_xchg_ret_i32_offset:
+; EG: LDS_WRXCHG_RET *
+; GCN: ds_wrxchg_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
+define void @lds_atomic_xchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw xchg i32 addrspace(3)* %gep, i32 4 seq_cst
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; XXX - Is it really necessary to load 4 into VGPR?
+; FUNC-LABEL: {{^}}lds_atomic_add_ret_i32:
+; EG: LDS_ADD_RET *
+; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
+; GCN: s_load_dword [[SPTR:s[0-9]+]],
+; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
+; GCN: ds_add_rtn_u32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]]
+; GCN: buffer_store_dword [[RESULT]],
+; GCN: s_endpgm
+define void @lds_atomic_add_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw add i32 addrspace(3)* %ptr, i32 4 seq_cst
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_add_ret_i32_offset:
+; EG: LDS_ADD_RET *
+; GCN: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
+define void @lds_atomic_add_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw add i32 addrspace(3)* %gep, i32 4 seq_cst
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_add_ret_i32_bad_si_offset:
+; EG: LDS_ADD_RET *
+; SI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; CIVI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
+define void @lds_atomic_add_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind {
+ %sub = sub i32 %a, %b
+ %add = add i32 %sub, 4
+ %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 %add
+ %result = atomicrmw add i32 addrspace(3)* %gep, i32 4 seq_cst
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_inc_ret_i32:
+; EG: LDS_ADD_RET *
+; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1
+; GCN: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[NEGONE]]
+; GCN: s_endpgm
+define void @lds_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw add i32 addrspace(3)* %ptr, i32 1 seq_cst
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_inc_ret_i32_offset:
+; EG: LDS_ADD_RET *
+; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1
+; GCN: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[NEGONE]] offset:16
+; GCN: s_endpgm
+define void @lds_atomic_inc_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw add i32 addrspace(3)* %gep, i32 1 seq_cst
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_inc_ret_i32_bad_si_offset:
+; EG: LDS_ADD_RET *
+; SI: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; CIVI: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
+define void @lds_atomic_inc_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind {
+ %sub = sub i32 %a, %b
+ %add = add i32 %sub, 4
+ %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 %add
+ %result = atomicrmw add i32 addrspace(3)* %gep, i32 1 seq_cst
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_sub_ret_i32:
+; EG: LDS_SUB_RET *
+; GCN: ds_sub_rtn_u32
+; GCN: s_endpgm
+define void @lds_atomic_sub_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw sub i32 addrspace(3)* %ptr, i32 4 seq_cst
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_sub_ret_i32_offset:
+; EG: LDS_SUB_RET *
+; GCN: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
+define void @lds_atomic_sub_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw sub i32 addrspace(3)* %gep, i32 4 seq_cst
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_dec_ret_i32:
+; EG: LDS_SUB_RET *
+; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1
+; GCN: ds_dec_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[NEGONE]]
+; GCN: s_endpgm
+define void @lds_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw sub i32 addrspace(3)* %ptr, i32 1 seq_cst
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_dec_ret_i32_offset:
+; EG: LDS_SUB_RET *
+; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1
+; GCN: ds_dec_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[NEGONE]] offset:16
+; GCN: s_endpgm
+define void @lds_atomic_dec_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw sub i32 addrspace(3)* %gep, i32 1 seq_cst
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_and_ret_i32:
+; EG: LDS_AND_RET *
+; GCN: ds_and_rtn_b32
+; GCN: s_endpgm
+define void @lds_atomic_and_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw and i32 addrspace(3)* %ptr, i32 4 seq_cst
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_and_ret_i32_offset:
+; EG: LDS_AND_RET *
+; GCN: ds_and_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
+define void @lds_atomic_and_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw and i32 addrspace(3)* %gep, i32 4 seq_cst
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_or_ret_i32:
+; EG: LDS_OR_RET *
+; GCN: ds_or_rtn_b32
+; GCN: s_endpgm
+define void @lds_atomic_or_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw or i32 addrspace(3)* %ptr, i32 4 seq_cst
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_or_ret_i32_offset:
+; EG: LDS_OR_RET *
+; GCN: ds_or_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
+define void @lds_atomic_or_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw or i32 addrspace(3)* %gep, i32 4 seq_cst
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_xor_ret_i32:
+; EG: LDS_XOR_RET *
+; GCN: ds_xor_rtn_b32
+; GCN: s_endpgm
+define void @lds_atomic_xor_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw xor i32 addrspace(3)* %ptr, i32 4 seq_cst
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_xor_ret_i32_offset:
+; EG: LDS_XOR_RET *
+; GCN: ds_xor_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
+define void @lds_atomic_xor_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw xor i32 addrspace(3)* %gep, i32 4 seq_cst
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FIXME: There is no atomic nand instruction, so we somehow need to expand this.
+; XFUNC-LABEL: {{^}}lds_atomic_nand_ret_i32:
+; define void @lds_atomic_nand_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+; %result = atomicrmw nand i32 addrspace(3)* %ptr, i32 4 seq_cst
+; store i32 %result, i32 addrspace(1)* %out, align 4
+; ret void
+; }
+
+; FUNC-LABEL: {{^}}lds_atomic_min_ret_i32:
+; EG: LDS_MIN_INT_RET *
+; GCN: ds_min_rtn_i32
+; GCN: s_endpgm
+define void @lds_atomic_min_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw min i32 addrspace(3)* %ptr, i32 4 seq_cst
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_min_ret_i32_offset:
+; EG: LDS_MIN_INT_RET *
+; GCN: ds_min_rtn_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
+define void @lds_atomic_min_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw min i32 addrspace(3)* %gep, i32 4 seq_cst
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_max_ret_i32:
+; EG: LDS_MAX_INT_RET *
+; GCN: ds_max_rtn_i32
+; GCN: s_endpgm
+define void @lds_atomic_max_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw max i32 addrspace(3)* %ptr, i32 4 seq_cst
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_max_ret_i32_offset:
+; EG: LDS_MAX_INT_RET *
+; GCN: ds_max_rtn_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
+define void @lds_atomic_max_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw max i32 addrspace(3)* %gep, i32 4 seq_cst
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_umin_ret_i32:
+; EG: LDS_MIN_UINT_RET *
+; GCN: ds_min_rtn_u32
+; GCN: s_endpgm
+define void @lds_atomic_umin_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw umin i32 addrspace(3)* %ptr, i32 4 seq_cst
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_umin_ret_i32_offset:
+; EG: LDS_MIN_UINT_RET *
+; GCN: ds_min_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
+define void @lds_atomic_umin_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw umin i32 addrspace(3)* %gep, i32 4 seq_cst
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_umax_ret_i32:
+; EG: LDS_MAX_UINT_RET *
+; GCN: ds_max_rtn_u32
+; GCN: s_endpgm
+define void @lds_atomic_umax_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw umax i32 addrspace(3)* %ptr, i32 4 seq_cst
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_umax_ret_i32_offset:
+; EG: LDS_MAX_UINT_RET *
+; GCN: ds_max_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
+define void @lds_atomic_umax_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw umax i32 addrspace(3)* %gep, i32 4 seq_cst
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_xchg_noret_i32:
+; GCN: s_load_dword [[SPTR:s[0-9]+]],
+; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
+; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
+; GCN: ds_wrxchg_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]]
+; GCN: s_endpgm
+define void @lds_atomic_xchg_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw xchg i32 addrspace(3)* %ptr, i32 4 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_xchg_noret_i32_offset:
+; GCN: ds_wrxchg_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
+define void @lds_atomic_xchg_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw xchg i32 addrspace(3)* %gep, i32 4 seq_cst
+ ret void
+}
+
+; XXX - Is it really necessary to load 4 into VGPR?
+; FUNC-LABEL: {{^}}lds_atomic_add_noret_i32:
+; GCN: s_load_dword [[SPTR:s[0-9]+]],
+; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
+; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
+; GCN: ds_add_u32 [[VPTR]], [[DATA]]
+; GCN: s_endpgm
+define void @lds_atomic_add_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw add i32 addrspace(3)* %ptr, i32 4 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_add_noret_i32_offset:
+; GCN: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
+define void @lds_atomic_add_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw add i32 addrspace(3)* %gep, i32 4 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_add_noret_i32_bad_si_offset
+; SI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}}
+; CIVI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
+define void @lds_atomic_add_noret_i32_bad_si_offset(i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind {
+ %sub = sub i32 %a, %b
+ %add = add i32 %sub, 4
+ %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 %add
+ %result = atomicrmw add i32 addrspace(3)* %gep, i32 4 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_inc_noret_i32:
+; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1
+; GCN: ds_inc_u32 v{{[0-9]+}}, [[NEGONE]]
+; GCN: s_endpgm
+define void @lds_atomic_inc_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw add i32 addrspace(3)* %ptr, i32 1 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_inc_noret_i32_offset:
+; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1
+; GCN: ds_inc_u32 v{{[0-9]+}}, [[NEGONE]] offset:16
+; GCN: s_endpgm
+define void @lds_atomic_inc_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw add i32 addrspace(3)* %gep, i32 1 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_inc_noret_i32_bad_si_offset:
+; SI: ds_inc_u32 v{{[0-9]+}}, v{{[0-9]+}}
+; CIVI: ds_inc_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
+define void @lds_atomic_inc_noret_i32_bad_si_offset(i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind {
+ %sub = sub i32 %a, %b
+ %add = add i32 %sub, 4
+ %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 %add
+ %result = atomicrmw add i32 addrspace(3)* %gep, i32 1 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_sub_noret_i32:
+; GCN: ds_sub_u32
+; GCN: s_endpgm
+define void @lds_atomic_sub_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw sub i32 addrspace(3)* %ptr, i32 4 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_sub_noret_i32_offset:
+; GCN: ds_sub_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
+define void @lds_atomic_sub_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw sub i32 addrspace(3)* %gep, i32 4 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_dec_noret_i32:
+; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1
+; GCN: ds_dec_u32 v{{[0-9]+}}, [[NEGONE]]
+; GCN: s_endpgm
+define void @lds_atomic_dec_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw sub i32 addrspace(3)* %ptr, i32 1 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_dec_noret_i32_offset:
+; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1
+; GCN: ds_dec_u32 v{{[0-9]+}}, [[NEGONE]] offset:16
+; GCN: s_endpgm
+define void @lds_atomic_dec_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw sub i32 addrspace(3)* %gep, i32 1 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_and_noret_i32:
+; GCN: ds_and_b32
+; GCN: s_endpgm
+define void @lds_atomic_and_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw and i32 addrspace(3)* %ptr, i32 4 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_and_noret_i32_offset:
+; GCN: ds_and_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
+define void @lds_atomic_and_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw and i32 addrspace(3)* %gep, i32 4 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_or_noret_i32:
+; GCN: ds_or_b32
+; GCN: s_endpgm
+define void @lds_atomic_or_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw or i32 addrspace(3)* %ptr, i32 4 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_or_noret_i32_offset:
+; GCN: ds_or_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
+define void @lds_atomic_or_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw or i32 addrspace(3)* %gep, i32 4 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_xor_noret_i32:
+; GCN: ds_xor_b32
+; GCN: s_endpgm
+define void @lds_atomic_xor_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw xor i32 addrspace(3)* %ptr, i32 4 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_xor_noret_i32_offset:
+; GCN: ds_xor_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
+define void @lds_atomic_xor_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw xor i32 addrspace(3)* %gep, i32 4 seq_cst
+ ret void
+}
+
+; FIXME: There is no atomic nand instruction, so we somehow need to expand this.
+; XFUNC-LABEL: {{^}}lds_atomic_nand_noret_i32:
+; define void @lds_atomic_nand_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+; %result = atomicrmw nand i32 addrspace(3)* %ptr, i32 4 seq_cst
+; ret void
+; }
+
+; FUNC-LABEL: {{^}}lds_atomic_min_noret_i32:
+; GCN: ds_min_i32
+; GCN: s_endpgm
+define void @lds_atomic_min_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw min i32 addrspace(3)* %ptr, i32 4 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_min_noret_i32_offset:
+; GCN: ds_min_i32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
+define void @lds_atomic_min_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw min i32 addrspace(3)* %gep, i32 4 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_max_noret_i32:
+; GCN: ds_max_i32
+; GCN: s_endpgm
+define void @lds_atomic_max_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw max i32 addrspace(3)* %ptr, i32 4 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_max_noret_i32_offset:
+; GCN: ds_max_i32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
+define void @lds_atomic_max_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw max i32 addrspace(3)* %gep, i32 4 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_umin_noret_i32:
+; GCN: ds_min_u32
+; GCN: s_endpgm
+define void @lds_atomic_umin_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw umin i32 addrspace(3)* %ptr, i32 4 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_umin_noret_i32_offset:
+; GCN: ds_min_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
+define void @lds_atomic_umin_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw umin i32 addrspace(3)* %gep, i32 4 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_umax_noret_i32:
+; GCN: ds_max_u32
+; GCN: s_endpgm
+define void @lds_atomic_umax_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw umax i32 addrspace(3)* %ptr, i32 4 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_umax_noret_i32_offset:
+; GCN: ds_max_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
+define void @lds_atomic_umax_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw umax i32 addrspace(3)* %gep, i32 4 seq_cst
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/local-atomics64.ll b/test/CodeGen/AMDGPU/local-atomics64.ll
new file mode 100644
index 000000000000..0ffa5e751b7d
--- /dev/null
+++ b/test/CodeGen/AMDGPU/local-atomics64.ll
@@ -0,0 +1,470 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=SI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=VI -check-prefix=GCN %s
+
+; FUNC-LABEL: {{^}}lds_atomic_xchg_ret_i64:
+; GCN: ds_wrxchg_rtn_b64
+; GCN: s_endpgm
+define void @lds_atomic_xchg_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw xchg i64 addrspace(3)* %ptr, i64 4 seq_cst
+ store i64 %result, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_xchg_ret_i64_offset:
+; GCN: ds_wrxchg_rtn_b64 {{.*}} offset:32
+; GCN: s_endpgm
+define void @lds_atomic_xchg_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw xchg i64 addrspace(3)* %gep, i64 4 seq_cst
+ store i64 %result, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_add_ret_i64:
+; GCN: ds_add_rtn_u64
+; GCN: s_endpgm
+define void @lds_atomic_add_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw add i64 addrspace(3)* %ptr, i64 4 seq_cst
+ store i64 %result, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_add_ret_i64_offset:
+; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 9
+; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0
+; SI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
+; GCN: ds_add_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} offset:32
+; GCN: buffer_store_dwordx2 [[RESULT]],
+; GCN: s_endpgm
+define void @lds_atomic_add_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i64, i64 addrspace(3)* %ptr, i64 4
+ %result = atomicrmw add i64 addrspace(3)* %gep, i64 9 seq_cst
+ store i64 %result, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_inc_ret_i64:
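+; Note (informal explanation, not a check line): ds_inc wraps the memory value
+; to 0 once it reaches the data operand, so passing -1 as the threshold makes
+; 'atomicrmw add %ptr, 1' behave as a plain 64-bit increment.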
+; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], -1
+; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], -1
+; GCN: ds_inc_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}
+; GCN: buffer_store_dwordx2 [[RESULT]],
+; GCN: s_endpgm
+define void @lds_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw add i64 addrspace(3)* %ptr, i64 1 seq_cst
+ store i64 %result, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_inc_ret_i64_offset:
+; GCN: ds_inc_rtn_u64 {{.*}} offset:32
+; GCN: s_endpgm
+define void @lds_atomic_inc_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw add i64 addrspace(3)* %gep, i64 1 seq_cst
+ store i64 %result, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_sub_ret_i64:
+; GCN: ds_sub_rtn_u64
+; GCN: s_endpgm
+define void @lds_atomic_sub_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw sub i64 addrspace(3)* %ptr, i64 4 seq_cst
+ store i64 %result, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_sub_ret_i64_offset:
+; GCN: ds_sub_rtn_u64 {{.*}} offset:32
+; GCN: s_endpgm
+define void @lds_atomic_sub_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw sub i64 addrspace(3)* %gep, i64 4 seq_cst
+ store i64 %result, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_dec_ret_i64:
+; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], -1
+; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], -1
+; GCN: ds_dec_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}
+; GCN: buffer_store_dwordx2 [[RESULT]],
+; GCN: s_endpgm
+define void @lds_atomic_dec_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw sub i64 addrspace(3)* %ptr, i64 1 seq_cst
+ store i64 %result, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_dec_ret_i64_offset:
+; GCN: ds_dec_rtn_u64 {{.*}} offset:32
+; GCN: s_endpgm
+define void @lds_atomic_dec_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw sub i64 addrspace(3)* %gep, i64 1 seq_cst
+ store i64 %result, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_and_ret_i64:
+; GCN: ds_and_rtn_b64
+; GCN: s_endpgm
+define void @lds_atomic_and_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw and i64 addrspace(3)* %ptr, i64 4 seq_cst
+ store i64 %result, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_and_ret_i64_offset:
+; GCN: ds_and_rtn_b64 {{.*}} offset:32
+; GCN: s_endpgm
+define void @lds_atomic_and_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw and i64 addrspace(3)* %gep, i64 4 seq_cst
+ store i64 %result, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_or_ret_i64:
+; GCN: ds_or_rtn_b64
+; GCN: s_endpgm
+define void @lds_atomic_or_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw or i64 addrspace(3)* %ptr, i64 4 seq_cst
+ store i64 %result, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_or_ret_i64_offset:
+; GCN: ds_or_rtn_b64 {{.*}} offset:32
+; GCN: s_endpgm
+define void @lds_atomic_or_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw or i64 addrspace(3)* %gep, i64 4 seq_cst
+ store i64 %result, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_xor_ret_i64:
+; GCN: ds_xor_rtn_b64
+; GCN: s_endpgm
+define void @lds_atomic_xor_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw xor i64 addrspace(3)* %ptr, i64 4 seq_cst
+ store i64 %result, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_xor_ret_i64_offset:
+; GCN: ds_xor_rtn_b64 {{.*}} offset:32
+; GCN: s_endpgm
+define void @lds_atomic_xor_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw xor i64 addrspace(3)* %gep, i64 4 seq_cst
+ store i64 %result, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FIXME: There is no atomic nand instruction, so we somehow need to expand this.
+; XFUNC-LABEL: {{^}}lds_atomic_nand_ret_i64:
+; define void @lds_atomic_nand_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+; %result = atomicrmw nand i64 addrspace(3)* %ptr, i64 4 seq_cst
+; store i64 %result, i64 addrspace(1)* %out, align 8
+; ret void
+; }
+
+; FUNC-LABEL: {{^}}lds_atomic_min_ret_i64:
+; GCN: ds_min_rtn_i64
+; GCN: s_endpgm
+define void @lds_atomic_min_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw min i64 addrspace(3)* %ptr, i64 4 seq_cst
+ store i64 %result, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_min_ret_i64_offset:
+; GCN: ds_min_rtn_i64 {{.*}} offset:32
+; GCN: s_endpgm
+define void @lds_atomic_min_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw min i64 addrspace(3)* %gep, i64 4 seq_cst
+ store i64 %result, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_max_ret_i64:
+; GCN: ds_max_rtn_i64
+; GCN: s_endpgm
+define void @lds_atomic_max_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw max i64 addrspace(3)* %ptr, i64 4 seq_cst
+ store i64 %result, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_max_ret_i64_offset:
+; GCN: ds_max_rtn_i64 {{.*}} offset:32
+; GCN: s_endpgm
+define void @lds_atomic_max_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw max i64 addrspace(3)* %gep, i64 4 seq_cst
+ store i64 %result, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_umin_ret_i64:
+; GCN: ds_min_rtn_u64
+; GCN: s_endpgm
+define void @lds_atomic_umin_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw umin i64 addrspace(3)* %ptr, i64 4 seq_cst
+ store i64 %result, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_umin_ret_i64_offset:
+; GCN: ds_min_rtn_u64 {{.*}} offset:32
+; GCN: s_endpgm
+define void @lds_atomic_umin_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw umin i64 addrspace(3)* %gep, i64 4 seq_cst
+ store i64 %result, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_umax_ret_i64:
+; GCN: ds_max_rtn_u64
+; GCN: s_endpgm
+define void @lds_atomic_umax_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw umax i64 addrspace(3)* %ptr, i64 4 seq_cst
+ store i64 %result, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_umax_ret_i64_offset:
+; GCN: ds_max_rtn_u64 {{.*}} offset:32
+; GCN: s_endpgm
+define void @lds_atomic_umax_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw umax i64 addrspace(3)* %gep, i64 4 seq_cst
+ store i64 %result, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_xchg_noret_i64:
+; GCN: ds_wrxchg_rtn_b64
+; GCN: s_endpgm
+define void @lds_atomic_xchg_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw xchg i64 addrspace(3)* %ptr, i64 4 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_xchg_noret_i64_offset:
+; GCN: ds_wrxchg_rtn_b64 {{.*}} offset:32
+; GCN: s_endpgm
+define void @lds_atomic_xchg_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw xchg i64 addrspace(3)* %gep, i64 4 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_add_noret_i64:
+; GCN: ds_add_u64
+; GCN: s_endpgm
+define void @lds_atomic_add_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw add i64 addrspace(3)* %ptr, i64 4 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_add_noret_i64_offset:
+; SI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9
+; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x24
+; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 9
+; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0
+; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
+; GCN: ds_add_u64 [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} offset:32
+; GCN: s_endpgm
+define void @lds_atomic_add_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i64, i64 addrspace(3)* %ptr, i64 4
+ %result = atomicrmw add i64 addrspace(3)* %gep, i64 9 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_inc_noret_i64:
+; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], -1
+; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], -1
+; GCN: ds_inc_u64 [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}
+; GCN: s_endpgm
+define void @lds_atomic_inc_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw add i64 addrspace(3)* %ptr, i64 1 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_inc_noret_i64_offset:
+; GCN: ds_inc_u64 {{.*}} offset:32
+; GCN: s_endpgm
+define void @lds_atomic_inc_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw add i64 addrspace(3)* %gep, i64 1 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_sub_noret_i64:
+; GCN: ds_sub_u64
+; GCN: s_endpgm
+define void @lds_atomic_sub_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw sub i64 addrspace(3)* %ptr, i64 4 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_sub_noret_i64_offset:
+; GCN: ds_sub_u64 {{.*}} offset:32
+; GCN: s_endpgm
+define void @lds_atomic_sub_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw sub i64 addrspace(3)* %gep, i64 4 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_dec_noret_i64:
+; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], -1
+; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], -1
+; GCN: ds_dec_u64 [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}
+; GCN: s_endpgm
+define void @lds_atomic_dec_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw sub i64 addrspace(3)* %ptr, i64 1 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_dec_noret_i64_offset:
+; GCN: ds_dec_u64 {{.*}} offset:32
+; GCN: s_endpgm
+define void @lds_atomic_dec_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw sub i64 addrspace(3)* %gep, i64 1 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_and_noret_i64:
+; GCN: ds_and_b64
+; GCN: s_endpgm
+define void @lds_atomic_and_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw and i64 addrspace(3)* %ptr, i64 4 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_and_noret_i64_offset:
+; GCN: ds_and_b64 {{.*}} offset:32
+; GCN: s_endpgm
+define void @lds_atomic_and_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw and i64 addrspace(3)* %gep, i64 4 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_or_noret_i64:
+; GCN: ds_or_b64
+; GCN: s_endpgm
+define void @lds_atomic_or_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw or i64 addrspace(3)* %ptr, i64 4 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_or_noret_i64_offset:
+; GCN: ds_or_b64 {{.*}} offset:32
+; GCN: s_endpgm
+define void @lds_atomic_or_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw or i64 addrspace(3)* %gep, i64 4 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_xor_noret_i64:
+; GCN: ds_xor_b64
+; GCN: s_endpgm
+define void @lds_atomic_xor_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw xor i64 addrspace(3)* %ptr, i64 4 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_xor_noret_i64_offset:
+; GCN: ds_xor_b64 {{.*}} offset:32
+; GCN: s_endpgm
+define void @lds_atomic_xor_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw xor i64 addrspace(3)* %gep, i64 4 seq_cst
+ ret void
+}
+
+; FIXME: There is no atomic nand instruction, so we somehow need to expand this.
+; XFUNC-LABEL: {{^}}lds_atomic_nand_noret_i64:
+; define void @lds_atomic_nand_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+; %result = atomicrmw nand i64 addrspace(3)* %ptr, i64 4 seq_cst
+; ret void
+; }
+
+; FUNC-LABEL: {{^}}lds_atomic_min_noret_i64:
+; GCN: ds_min_i64
+; GCN: s_endpgm
+define void @lds_atomic_min_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw min i64 addrspace(3)* %ptr, i64 4 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_min_noret_i64_offset:
+; GCN: ds_min_i64 {{.*}} offset:32
+; GCN: s_endpgm
+define void @lds_atomic_min_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw min i64 addrspace(3)* %gep, i64 4 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_max_noret_i64:
+; GCN: ds_max_i64
+; GCN: s_endpgm
+define void @lds_atomic_max_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw max i64 addrspace(3)* %ptr, i64 4 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_max_noret_i64_offset:
+; GCN: ds_max_i64 {{.*}} offset:32
+; GCN: s_endpgm
+define void @lds_atomic_max_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw max i64 addrspace(3)* %gep, i64 4 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_umin_noret_i64:
+; GCN: ds_min_u64
+; GCN: s_endpgm
+define void @lds_atomic_umin_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw umin i64 addrspace(3)* %ptr, i64 4 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_umin_noret_i64_offset:
+; GCN: ds_min_u64 {{.*}} offset:32
+; GCN: s_endpgm
+define void @lds_atomic_umin_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw umin i64 addrspace(3)* %gep, i64 4 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_umax_noret_i64:
+; GCN: ds_max_u64
+; GCN: s_endpgm
+define void @lds_atomic_umax_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw umax i64 addrspace(3)* %ptr, i64 4 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_umax_noret_i64_offset:
+; GCN: ds_max_u64 {{.*}} offset:32
+; GCN: s_endpgm
+define void @lds_atomic_umax_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw umax i64 addrspace(3)* %gep, i64 4 seq_cst
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/local-memory-two-objects.ll b/test/CodeGen/AMDGPU/local-memory-two-objects.ll
new file mode 100644
index 000000000000..f501a7ac6274
--- /dev/null
+++ b/test/CodeGen/AMDGPU/local-memory-two-objects.ll
@@ -0,0 +1,63 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s
+; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=GCN --check-prefix=SI %s
+; RUN: llc < %s -march=amdgcn -mcpu=bonaire -verify-machineinstrs | FileCheck --check-prefix=GCN --check-prefix=CI %s
+
+@local_memory_two_objects.local_mem0 = internal unnamed_addr addrspace(3) global [4 x i32] undef, align 4
+@local_memory_two_objects.local_mem1 = internal unnamed_addr addrspace(3) global [4 x i32] undef, align 4
+
+
+; Check that the LDS size is emitted correctly
+; EG: .long 166120
+; EG-NEXT: .long 8
+; GCN: .long 47180
+; GCN-NEXT: .long 38792
+
+; EG: {{^}}local_memory_two_objects:
+
+; We would like to check that the LDS writes use different
+; addresses, but due to variations in the scheduler we can't do
+; this consistently on Evergreen GPUs.
+; EG: LDS_WRITE
+; EG: LDS_WRITE
+; GCN: ds_write_b32 {{v[0-9]*}}, v[[ADDRW:[0-9]*]]
+; GCN-NOT: ds_write_b32 {{v[0-9]*}}, v[[ADDRW]]
+
+; GROUP_BARRIER must be the last instruction in a clause
+; EG: GROUP_BARRIER
+; EG-NEXT: ALU clause
+
+; Make sure the lds reads are using different addresses, at different
+; constant offsets.
+; EG: LDS_READ_RET {{[*]*}} OQAP, {{PV|T}}[[ADDRR:[0-9]*\.[XYZW]]]
+; EG-NOT: LDS_READ_RET {{[*]*}} OQAP, T[[ADDRR]]
+; SI: v_add_i32_e32 [[SIPTR:v[0-9]+]], 16, v{{[0-9]+}}
+; SI: ds_read_b32 {{v[0-9]+}}, [[SIPTR]]
+; CI: ds_read_b32 {{v[0-9]+}}, [[ADDRR:v[0-9]+]] offset:16
+; CI: ds_read_b32 {{v[0-9]+}}, [[ADDRR]]
+
+define void @local_memory_two_objects(i32 addrspace(1)* %out) {
+entry:
+ %x.i = call i32 @llvm.r600.read.tidig.x() #0
+ %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %x.i
+ store i32 %x.i, i32 addrspace(3)* %arrayidx, align 4
+ %mul = shl nsw i32 %x.i, 1
+ %arrayidx1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem1, i32 0, i32 %x.i
+ store i32 %mul, i32 addrspace(3)* %arrayidx1, align 4
+ %sub = sub nsw i32 3, %x.i
+ call void @llvm.AMDGPU.barrier.local()
+ %arrayidx2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %sub
+ %0 = load i32, i32 addrspace(3)* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %x.i
+ store i32 %0, i32 addrspace(1)* %arrayidx3, align 4
+ %arrayidx4 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem1, i32 0, i32 %sub
+ %1 = load i32, i32 addrspace(3)* %arrayidx4, align 4
+ %add = add nsw i32 %x.i, 4
+ %arrayidx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %add
+ store i32 %1, i32 addrspace(1)* %arrayidx5, align 4
+ ret void
+}
+
+declare i32 @llvm.r600.read.tidig.x() #0
+declare void @llvm.AMDGPU.barrier.local()
+
+attributes #0 = { readnone }
diff --git a/test/CodeGen/AMDGPU/local-memory.ll b/test/CodeGen/AMDGPU/local-memory.ll
new file mode 100644
index 000000000000..9494ed75bd0c
--- /dev/null
+++ b/test/CodeGen/AMDGPU/local-memory.ll
@@ -0,0 +1,49 @@
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
+
+@local_memory.local_mem = internal unnamed_addr addrspace(3) global [128 x i32] undef, align 4
+
+
+; Check that the LDS size is emitted correctly
+; EG: .long 166120
+; EG-NEXT: .long 128
+; SI: .long 47180
+; SI-NEXT: .long 71560
+; CI: .long 47180
+; CI-NEXT: .long 38792
+
+; FUNC-LABEL: {{^}}local_memory:
+
+; EG: LDS_WRITE
+; SI-NOT: s_wqm_b64
+; SI: ds_write_b32
+
+; GROUP_BARRIER must be the last instruction in a clause
+; EG: GROUP_BARRIER
+; EG-NEXT: ALU clause
+; SI: s_barrier
+
+; EG: LDS_READ_RET
+; SI: ds_read_b32 {{v[0-9]+}},
+
+define void @local_memory(i32 addrspace(1)* %out) {
+entry:
+ %y.i = call i32 @llvm.r600.read.tidig.x() #0
+ %arrayidx = getelementptr inbounds [128 x i32], [128 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %y.i
+ store i32 %y.i, i32 addrspace(3)* %arrayidx, align 4
+ %add = add nsw i32 %y.i, 1
+ %cmp = icmp eq i32 %add, 16
+ %.add = select i1 %cmp, i32 0, i32 %add
+ call void @llvm.AMDGPU.barrier.local()
+ %arrayidx1 = getelementptr inbounds [128 x i32], [128 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %.add
+ %0 = load i32, i32 addrspace(3)* %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %y.i
+ store i32 %0, i32 addrspace(1)* %arrayidx2, align 4
+ ret void
+}
+
+declare i32 @llvm.r600.read.tidig.x() #0
+declare void @llvm.AMDGPU.barrier.local()
+
+attributes #0 = { readnone }
diff --git a/test/CodeGen/AMDGPU/loop-address.ll b/test/CodeGen/AMDGPU/loop-address.ll
new file mode 100644
index 000000000000..f60d574497de
--- /dev/null
+++ b/test/CodeGen/AMDGPU/loop-address.ll
@@ -0,0 +1,34 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+;CHECK: ALU_PUSH
+;CHECK: LOOP_START_DX10 @11
+;CHECK: LOOP_BREAK @10
+;CHECK: POP @10
+
+define void @loop_ge(i32 addrspace(1)* nocapture %out, i32 %iterations) #0 {
+entry:
+ %cmp5 = icmp sgt i32 %iterations, 0
+ br i1 %cmp5, label %for.body, label %for.end
+
+for.body: ; preds = %for.body, %entry
+ %i.07.in = phi i32 [ %i.07, %for.body ], [ %iterations, %entry ]
+ %ai.06 = phi i32 [ %add, %for.body ], [ 0, %entry ]
+ %i.07 = add nsw i32 %i.07.in, -1
+ %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %ai.06
+ store i32 %i.07, i32 addrspace(1)* %arrayidx, align 4
+ %add = add nsw i32 %ai.06, 1
+ %exitcond = icmp eq i32 %add, %iterations
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+attributes #0 = { nounwind "fp-contract-model"="standard" "relocation-model"="pic" "ssp-buffers-size"="8" }
+
+!opencl.kernels = !{!0, !1, !2, !3}
+
+!0 = !{void (i32 addrspace(1)*, i32)* @loop_ge}
+!1 = !{null}
+!2 = !{null}
+!3 = !{null}
diff --git a/test/CodeGen/AMDGPU/loop-idiom.ll b/test/CodeGen/AMDGPU/loop-idiom.ll
new file mode 100644
index 000000000000..5fd9806813cd
--- /dev/null
+++ b/test/CodeGen/AMDGPU/loop-idiom.ll
@@ -0,0 +1,51 @@
+; RUN: opt -basicaa -loop-idiom -S < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600 --check-prefix=FUNC %s
+; RUN: opt -basicaa -loop-idiom -S < %s -march=amdgcn -mcpu=SI -verify-machineinstrs| FileCheck --check-prefix=SI --check-prefix=FUNC %s
+; RUN: opt -basicaa -loop-idiom -S < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs| FileCheck --check-prefix=SI --check-prefix=FUNC %s
+
+
+; Make sure loop-idiom doesn't create memcpy or memset. There are no library
+; implementations of these for R600.
+
+; FUNC: @no_memcpy
+; R600-NOT: {{^}}llvm.memcpy
+; SI-NOT: {{^}}llvm.memcpy
+define void @no_memcpy(i8 addrspace(3)* %in, i32 %size) {
+entry:
+ %dest = alloca i8, i32 32
+ br label %for.body
+
+for.body:
+ %0 = phi i32 [0, %entry], [%4, %for.body]
+ %1 = getelementptr i8, i8 addrspace(3)* %in, i32 %0
+ %2 = getelementptr i8, i8* %dest, i32 %0
+ %3 = load i8, i8 addrspace(3)* %1
+ store i8 %3, i8* %2
+ %4 = add i32 %0, 1
+ %5 = icmp eq i32 %4, %size
+ br i1 %5, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+; FUNC: @no_memset
+; R600-NOT: {{^}}llvm.memset
+; R600-NOT: {{^}}memset_pattern16:
+; SI-NOT: {{^}}llvm.memset
+; SI-NOT: {{^}}memset_pattern16:
+define void @no_memset(i32 %size) {
+entry:
+ %dest = alloca i8, i32 32
+ br label %for.body
+
+for.body:
+ %0 = phi i32 [0, %entry], [%2, %for.body]
+ %1 = getelementptr i8, i8* %dest, i32 %0
+ store i8 0, i8* %1
+ %2 = add i32 %0, 1
+ %3 = icmp eq i32 %2, %size
+ br i1 %3, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/lshl.ll b/test/CodeGen/AMDGPU/lshl.ll
new file mode 100644
index 000000000000..9ac988d38d1b
--- /dev/null
+++ b/test/CodeGen/AMDGPU/lshl.ll
@@ -0,0 +1,15 @@
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
+
+;CHECK: s_lshl_b32 s{{[0-9]}}, s{{[0-9]}}, 1
+
+define void @test(i32 %p) {
+ %i = mul i32 %p, 2
+ %r = bitcast i32 %i to float
+ call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %r, float %r, float %r, float %r)
+ ret void
+}
+
+declare <4 x float> @llvm.SI.sample.(i32, <4 x i32>, <8 x i32>, <4 x i32>, i32) readnone
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
diff --git a/test/CodeGen/AMDGPU/lshr.ll b/test/CodeGen/AMDGPU/lshr.ll
new file mode 100644
index 000000000000..50e444ac26b3
--- /dev/null
+++ b/test/CodeGen/AMDGPU/lshr.ll
@@ -0,0 +1,15 @@
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
+
+;CHECK: s_lshr_b32 s{{[0-9]}}, s{{[0-9]}}, 1
+
+define void @test(i32 %p) {
+ %i = udiv i32 %p, 2
+ %r = bitcast i32 %i to float
+ call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %r, float %r, float %r, float %r)
+ ret void
+}
+
+declare <4 x float> @llvm.SI.sample.(i32, <4 x i32>, <8 x i32>, <4 x i32>, i32) readnone
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
diff --git a/test/CodeGen/AMDGPU/m0-spill.ll b/test/CodeGen/AMDGPU/m0-spill.ll
new file mode 100644
index 000000000000..1dddc85f775d
--- /dev/null
+++ b/test/CodeGen/AMDGPU/m0-spill.ll
@@ -0,0 +1,35 @@
+; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
+
+@lds = external addrspace(3) global [64 x float]
+
+; CHECK-LABEL: {{^}}main:
+; CHECK-NOT: v_readlane_b32 m0
+define void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg) "ShaderType"="0" {
+main_body:
+ %4 = call float @llvm.SI.fs.constant(i32 0, i32 0, i32 %3)
+ %cmp = fcmp ueq float 0.0, %4
+ br i1 %cmp, label %if, label %else
+
+if:
+ %lds_ptr = getelementptr [64 x float], [64 x float] addrspace(3)* @lds, i32 0, i32 0
+ %lds_data = load float, float addrspace(3)* %lds_ptr
+ br label %endif
+
+else:
+ %interp = call float @llvm.SI.fs.constant(i32 0, i32 0, i32 %3)
+ br label %endif
+
+endif:
+ %export = phi float [%lds_data, %if], [%interp, %else]
+ %5 = call i32 @llvm.SI.packf16(float %export, float %export)
+ %6 = bitcast i32 %5 to float
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %6, float %6, float %6, float %6)
+ ret void
+}
+
+declare float @llvm.SI.fs.constant(i32, i32, i32) readnone
+
+declare i32 @llvm.SI.packf16(float, float) readnone
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
diff --git a/test/CodeGen/AMDGPU/mad-combine.ll b/test/CodeGen/AMDGPU/mad-combine.ll
new file mode 100644
index 000000000000..bc071628ead0
--- /dev/null
+++ b/test/CodeGen/AMDGPU/mad-combine.ll
@@ -0,0 +1,567 @@
+; Make sure we still form mad, rather than fma, even when unsafe math or fp-contract is allowed.
+
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=SI-STD -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI -check-prefix=SI-STD -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI -check-prefix=SI-STD -check-prefix=FUNC %s
+
+; Make sure we don't form mad with denormals
+; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=+fp32-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=SI-DENORM -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=verde -mattr=+fp32-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=SI-DENORM-SLOWFMAF -check-prefix=FUNC %s
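+; Assumed rationale for the prefixes above: v_mad_f32 flushes f32 denormals, so
+; with +fp32-denormals the combiner must use v_fma_f32 instead, or fall back to
+; separate mul/add when fma is slow on the target (the SLOWFMAF runs).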
+
+declare i32 @llvm.r600.read.tidig.x() #0
+declare float @llvm.fabs.f32(float) #0
+declare float @llvm.fma.f32(float, float, float) #0
+declare float @llvm.fmuladd.f32(float, float, float) #0
+
+; (fadd (fmul x, y), z) -> (fma x, y, z)
+; FUNC-LABEL: {{^}}combine_to_mad_f32_0:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+
+; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]]
+
+; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]]
+
+; SI-DENORM-SLOWFMAF-NOT: v_fma
+; SI-DENORM-SLOWFMAF-NOT: v_mad
+
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
+; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]]
+
+; SI: buffer_store_dword [[RESULT]]
+define void @combine_to_mad_f32_0(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+ %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+ %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
+ %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
+
+ %a = load float, float addrspace(1)* %gep.0
+ %b = load float, float addrspace(1)* %gep.1
+ %c = load float, float addrspace(1)* %gep.2
+
+ %mul = fmul float %a, %b
+ %fma = fadd float %mul, %c
+ store float %fma, float addrspace(1)* %gep.out
+ ret void
+}
+
+; (fadd (fmul x, y), z) -> (fma x, y, z)
+; FUNC-LABEL: {{^}}combine_to_mad_f32_0_2use:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
+
+; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], [[C]]
+; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], [[D]]
+
+; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], [[C]]
+; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], [[D]]
+
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
+; SI-DENORM-SLOWFMAF-DAG: v_add_f32_e32 [[RESULT0:v[0-9]+]], [[C]], [[TMP]]
+; SI-DENORM-SLOWFMAF-DAG: v_add_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]]
+
+; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI: s_endpgm
+define void @combine_to_mad_f32_0_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+ %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+ %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
+ %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
+ %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1
+
+ %a = load float, float addrspace(1)* %gep.0
+ %b = load float, float addrspace(1)* %gep.1
+ %c = load float, float addrspace(1)* %gep.2
+ %d = load float, float addrspace(1)* %gep.3
+
+ %mul = fmul float %a, %b
+ %fma0 = fadd float %mul, %c
+ %fma1 = fadd float %mul, %d
+
+ store float %fma0, float addrspace(1)* %gep.out.0
+ store float %fma1, float addrspace(1)* %gep.out.1
+ ret void
+}
+
+; (fadd x, (fmul y, z)) -> (fma y, z, x)
+; FUNC-LABEL: {{^}}combine_to_mad_f32_1:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+
+; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]]
+; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]]
+
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
+; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]]
+
+; SI: buffer_store_dword [[RESULT]]
+define void @combine_to_mad_f32_1(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+ %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+ %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
+ %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
+
+ %a = load float, float addrspace(1)* %gep.0
+ %b = load float, float addrspace(1)* %gep.1
+ %c = load float, float addrspace(1)* %gep.2
+
+ %mul = fmul float %a, %b
+ %fma = fadd float %c, %mul
+ store float %fma, float addrspace(1)* %gep.out
+ ret void
+}
+
+; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
+; FUNC-LABEL: {{^}}combine_to_mad_fsub_0_f32:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+
+; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], -[[C]]
+; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], -[[C]]
+
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
+; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]]
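+; (v_subrev_f32 computes src1 - src0, so the SLOWFMAF line above checks for
+; TMP - C, i.e. the mul result minus c.)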
+
+; SI: buffer_store_dword [[RESULT]]
+define void @combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+ %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+ %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
+ %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
+
+ %a = load float, float addrspace(1)* %gep.0
+ %b = load float, float addrspace(1)* %gep.1
+ %c = load float, float addrspace(1)* %gep.2
+
+ %mul = fmul float %a, %b
+ %fma = fsub float %mul, %c
+ store float %fma, float addrspace(1)* %gep.out
+ ret void
+}
+
+; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
+; FUNC-LABEL: {{^}}combine_to_mad_fsub_0_f32_2use:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
+
+; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], -[[C]]
+; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]
+
+; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], -[[C]]
+; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]
+
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
+; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT0:v[0-9]+]], [[C]], [[TMP]]
+; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]]
+
+; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI: s_endpgm
+define void @combine_to_mad_fsub_0_f32_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+ %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+ %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
+ %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
+ %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1
+
+ %a = load float, float addrspace(1)* %gep.0
+ %b = load float, float addrspace(1)* %gep.1
+ %c = load float, float addrspace(1)* %gep.2
+ %d = load float, float addrspace(1)* %gep.3
+
+ %mul = fmul float %a, %b
+ %fma0 = fsub float %mul, %c
+ %fma1 = fsub float %mul, %d
+ store float %fma0, float addrspace(1)* %gep.out.0
+ store float %fma1, float addrspace(1)* %gep.out.1
+ ret void
+}
+
+; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
+; FUNC-LABEL: {{^}}combine_to_mad_fsub_1_f32:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+
+; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], [[C]]
+; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], [[C]]
+
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
+; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]]
+
+; SI: buffer_store_dword [[RESULT]]
+define void @combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+ %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+ %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
+ %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
+
+ %a = load float, float addrspace(1)* %gep.0
+ %b = load float, float addrspace(1)* %gep.1
+ %c = load float, float addrspace(1)* %gep.2
+
+ %mul = fmul float %a, %b
+ %fma = fsub float %c, %mul
+ store float %fma, float addrspace(1)* %gep.out
+ ret void
+}
+
+; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
+; FUNC-LABEL: {{^}}combine_to_mad_fsub_1_f32_2use:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
+
+; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], [[C]]
+; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], [[D]]
+
+; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], [[C]]
+; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], [[D]]
+
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
+; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT0:v[0-9]+]], [[TMP]], [[C]]
+; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[TMP]], [[D]]
+
+; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI: s_endpgm
+define void @combine_to_mad_fsub_1_f32_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+ %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+ %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
+ %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
+ %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1
+
+ %a = load float, float addrspace(1)* %gep.0
+ %b = load float, float addrspace(1)* %gep.1
+ %c = load float, float addrspace(1)* %gep.2
+ %d = load float, float addrspace(1)* %gep.3
+
+ %mul = fmul float %a, %b
+ %fma0 = fsub float %c, %mul
+ %fma1 = fsub float %d, %mul
+ store float %fma0, float addrspace(1)* %gep.out.0
+ store float %fma1, float addrspace(1)* %gep.out.1
+ ret void
+}
+
+; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
+; FUNC-LABEL: {{^}}combine_to_mad_fsub_2_f32:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+
+; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], -[[C]]
+
+; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], -[[C]]
+
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
+; SI-DENORM-SLOWFMAF: v_sub_f32_e64 [[RESULT:v[0-9]+]], -[[TMP]], [[C]]
+
+; SI: buffer_store_dword [[RESULT]]
+define void @combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+ %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+ %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
+ %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
+
+ %a = load float, float addrspace(1)* %gep.0
+ %b = load float, float addrspace(1)* %gep.1
+ %c = load float, float addrspace(1)* %gep.2
+
+ %mul = fmul float %a, %b
+ %mul.neg = fsub float -0.0, %mul
+ %fma = fsub float %mul.neg, %c
+
+ store float %fma, float addrspace(1)* %gep.out
+ ret void
+}
+
+; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
+; FUNC-LABEL: {{^}}combine_to_mad_fsub_2_f32_2uses_neg:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
+
+; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]]
+; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], -[[D]]
+
+; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]]
+; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], -[[D]]
+
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
+; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e64 [[RESULT0:v[0-9]+]], -[[TMP]], [[C]]
+; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e64 [[RESULT1:v[0-9]+]], -[[TMP]], [[D]]
+
+; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI: s_endpgm
+define void @combine_to_mad_fsub_2_f32_2uses_neg(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+ %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+ %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
+ %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
+ %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1
+
+ %a = load float, float addrspace(1)* %gep.0
+ %b = load float, float addrspace(1)* %gep.1
+ %c = load float, float addrspace(1)* %gep.2
+ %d = load float, float addrspace(1)* %gep.3
+
+ %mul = fmul float %a, %b
+ %mul.neg = fsub float -0.0, %mul
+ %fma0 = fsub float %mul.neg, %c
+ %fma1 = fsub float %mul.neg, %d
+
+ store float %fma0, float addrspace(1)* %gep.out.0
+ store float %fma1, float addrspace(1)* %gep.out.1
+ ret void
+}
+
+; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
+; FUNC-LABEL: {{^}}combine_to_mad_fsub_2_f32_2uses_mul:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
+
+; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]]
+; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]
+
+; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]]
+; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]
+
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
+; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e64 [[RESULT0:v[0-9]+]], -[[TMP]], [[C]]
+; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]]
+
+; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI: s_endpgm
+define void @combine_to_mad_fsub_2_f32_2uses_mul(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+ %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+ %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
+ %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
+ %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1
+
+ %a = load float, float addrspace(1)* %gep.0
+ %b = load float, float addrspace(1)* %gep.1
+ %c = load float, float addrspace(1)* %gep.2
+ %d = load float, float addrspace(1)* %gep.3
+
+ %mul = fmul float %a, %b
+ %mul.neg = fsub float -0.0, %mul
+ %fma0 = fsub float %mul.neg, %c
+ %fma1 = fsub float %mul, %d
+
+ store float %fma0, float addrspace(1)* %gep.out.0
+ store float %fma1, float addrspace(1)* %gep.out.1
+ ret void
+}
+
+; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y, (fma u, v, (fneg z)))
+
+; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_0_f32:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
+; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
+
+; SI-STD: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
+; SI-STD: v_fma_f32 [[TMP1:v[0-9]+]], [[A]], [[B]], [[TMP0]]
+; SI-STD: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP1]]
+
+; SI-DENORM: v_fma_f32 [[TMP0:v[0-9]+]], [[D]], [[E]], -[[C]]
+; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[TMP0]]
+
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
+; SI-DENORM-SLOWFMAF: v_fma_f32 [[TMP1:v[0-9]+]], [[A]], [[B]], [[TMP0]]
+; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP1]]
+
+; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+define void @aggressive_combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+ %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+ %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
+ %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
+ %gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4
+ %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
+
+ %x = load float, float addrspace(1)* %gep.0
+ %y = load float, float addrspace(1)* %gep.1
+ %z = load float, float addrspace(1)* %gep.2
+ %u = load float, float addrspace(1)* %gep.3
+ %v = load float, float addrspace(1)* %gep.4
+
+ %tmp0 = fmul float %u, %v
+ %tmp1 = call float @llvm.fma.f32(float %x, float %y, float %tmp0) #0
+ %tmp2 = fsub float %tmp1, %z
+
+ store float %tmp2, float addrspace(1)* %gep.out
+ ret void
+}
+
+; fold (fsub x, (fma y, z, (fmul u, v)))
+; -> (fma (fneg y), z, (fma (fneg u), v, x))
+
+; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_1_f32:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
+; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
+
+; SI-STD: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
+; SI-STD: v_fma_f32 [[TMP1:v[0-9]+]], [[B]], [[C]], [[TMP0]]
+; SI-STD: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[A]]
+
+; SI-DENORM: v_fma_f32 [[TMP0:v[0-9]+]], -[[D]], [[E]], [[A]]
+; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], -[[B]], [[C]], [[TMP0]]
+
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
+; SI-DENORM-SLOWFMAF: v_fma_f32 [[TMP1:v[0-9]+]], [[B]], [[C]], [[TMP0]]
+; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[A]]
+
+; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI: s_endpgm
+define void @aggressive_combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+ %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+ %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
+ %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
+ %gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4
+ %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
+
+ %x = load float, float addrspace(1)* %gep.0
+ %y = load float, float addrspace(1)* %gep.1
+ %z = load float, float addrspace(1)* %gep.2
+ %u = load float, float addrspace(1)* %gep.3
+ %v = load float, float addrspace(1)* %gep.4
+
+ %tmp0 = fmul float %u, %v
+ %tmp1 = call float @llvm.fma.f32(float %y, float %z, float %tmp0) #0
+ %tmp2 = fsub float %x, %tmp1
+
+ store float %tmp2, float addrspace(1)* %gep.out
+ ret void
+}
+
+; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y, (fma u, v, (fneg z)))
+
+; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_2_f32:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
+; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
+
+; SI-STD: v_mad_f32 [[TMP:v[0-9]+]], [[D]], [[E]], -[[C]]
+; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[TMP]]
+
+; SI-DENORM: v_fma_f32 [[TMP:v[0-9]+]], [[D]], [[E]], -[[C]]
+; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[TMP]]
+
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP1:v[0-9]+]], [[B]], [[A]]
+; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[TMP2:v[0-9]+]], [[TMP0]], [[TMP1]]
+; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP2]]
+
+; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI: s_endpgm
+define void @aggressive_combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+ %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+ %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
+ %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
+ %gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4
+ %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
+
+ %x = load float, float addrspace(1)* %gep.0
+ %y = load float, float addrspace(1)* %gep.1
+ %z = load float, float addrspace(1)* %gep.2
+ %u = load float, float addrspace(1)* %gep.3
+ %v = load float, float addrspace(1)* %gep.4
+
+ %tmp0 = fmul float %u, %v
+ %tmp1 = call float @llvm.fmuladd.f32(float %x, float %y, float %tmp0) #0
+ %tmp2 = fsub float %tmp1, %z
+
+ store float %tmp2, float addrspace(1)* %gep.out
+ ret void
+}
+
+; fold (fsub x, (fmuladd y, z, (fmul u, v)))
+; -> (fmuladd (fneg y), z, (fmuladd (fneg u), v, x))
+
+; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_3_f32:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
+; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
+
+; SI-STD: v_mad_f32 [[TMP:v[0-9]+]], -[[D]], [[E]], [[A]]
+; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], -[[B]], [[C]], [[TMP]]
+
+; SI-DENORM: v_fma_f32 [[TMP:v[0-9]+]], -[[D]], [[E]], [[A]]
+; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], -[[B]], [[C]], [[TMP]]
+
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP1:v[0-9]+]], [[C]], [[B]]
+; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[TMP2:v[0-9]+]], [[TMP0]], [[TMP1]]
+; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP2]], [[A]]
+
+; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI: s_endpgm
+define void @aggressive_combine_to_mad_fsub_3_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+ %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+ %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
+ %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
+ %gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4
+ %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
+
+ %x = load float, float addrspace(1)* %gep.0
+ %y = load float, float addrspace(1)* %gep.1
+ %z = load float, float addrspace(1)* %gep.2
+ %u = load float, float addrspace(1)* %gep.3
+ %v = load float, float addrspace(1)* %gep.4
+
+ %tmp0 = fmul float %u, %v
+ %tmp1 = call float @llvm.fmuladd.f32(float %y, float %z, float %tmp0) #0
+ %tmp2 = fsub float %x, %tmp1
+
+ store float %tmp2, float addrspace(1)* %gep.out
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/mad-sub.ll b/test/CodeGen/AMDGPU/mad-sub.ll
new file mode 100644
index 000000000000..aa4194ff6106
--- /dev/null
+++ b/test/CodeGen/AMDGPU/mad-sub.ll
@@ -0,0 +1,215 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+declare i32 @llvm.r600.read.tidig.x() #0
+declare float @llvm.fabs.f32(float) #0
+
+; FUNC-LABEL: {{^}}mad_sub_f32:
+; SI: buffer_load_dword [[REGA:v[0-9]+]]
+; SI: buffer_load_dword [[REGB:v[0-9]+]]
+; SI: buffer_load_dword [[REGC:v[0-9]+]]
+; SI: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]]
+; SI: buffer_store_dword [[RESULT]]
+define void @mad_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 {
+ %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %tid.ext = sext i32 %tid to i64
+ %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
+ %add1 = add i64 %tid.ext, 1
+ %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
+ %add2 = add i64 %tid.ext, 2
+ %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
+ %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load float, float addrspace(1)* %gep0, align 4
+ %b = load float, float addrspace(1)* %gep1, align 4
+ %c = load float, float addrspace(1)* %gep2, align 4
+ %mul = fmul float %a, %b
+ %sub = fsub float %mul, %c
+ store float %sub, float addrspace(1)* %outgep, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}mad_sub_inv_f32:
+; SI: buffer_load_dword [[REGA:v[0-9]+]]
+; SI: buffer_load_dword [[REGB:v[0-9]+]]
+; SI: buffer_load_dword [[REGC:v[0-9]+]]
+; SI: v_mad_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]
+; SI: buffer_store_dword [[RESULT]]
+define void @mad_sub_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 {
+ %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %tid.ext = sext i32 %tid to i64
+ %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
+ %add1 = add i64 %tid.ext, 1
+ %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
+ %add2 = add i64 %tid.ext, 2
+ %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
+ %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load float, float addrspace(1)* %gep0, align 4
+ %b = load float, float addrspace(1)* %gep1, align 4
+ %c = load float, float addrspace(1)* %gep2, align 4
+ %mul = fmul float %a, %b
+ %sub = fsub float %c, %mul
+ store float %sub, float addrspace(1)* %outgep, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}mad_sub_f64:
+; SI: v_mul_f64
+; SI: v_add_f64
+define void @mad_sub_f64(double addrspace(1)* noalias nocapture %out, double addrspace(1)* noalias nocapture readonly %ptr) #1 {
+ %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %tid.ext = sext i32 %tid to i64
+ %gep0 = getelementptr double, double addrspace(1)* %ptr, i64 %tid.ext
+ %add1 = add i64 %tid.ext, 1
+ %gep1 = getelementptr double, double addrspace(1)* %ptr, i64 %add1
+ %add2 = add i64 %tid.ext, 2
+ %gep2 = getelementptr double, double addrspace(1)* %ptr, i64 %add2
+ %outgep = getelementptr double, double addrspace(1)* %out, i64 %tid.ext
+ %a = load double, double addrspace(1)* %gep0, align 8
+ %b = load double, double addrspace(1)* %gep1, align 8
+ %c = load double, double addrspace(1)* %gep2, align 8
+ %mul = fmul double %a, %b
+ %sub = fsub double %mul, %c
+ store double %sub, double addrspace(1)* %outgep, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}mad_sub_fabs_f32:
+; SI: buffer_load_dword [[REGA:v[0-9]+]]
+; SI: buffer_load_dword [[REGB:v[0-9]+]]
+; SI: buffer_load_dword [[REGC:v[0-9]+]]
+; SI: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]|
+; SI: buffer_store_dword [[RESULT]]
+define void @mad_sub_fabs_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 {
+ %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %tid.ext = sext i32 %tid to i64
+ %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
+ %add1 = add i64 %tid.ext, 1
+ %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
+ %add2 = add i64 %tid.ext, 2
+ %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
+ %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load float, float addrspace(1)* %gep0, align 4
+ %b = load float, float addrspace(1)* %gep1, align 4
+ %c = load float, float addrspace(1)* %gep2, align 4
+ %c.abs = call float @llvm.fabs.f32(float %c) #0
+ %mul = fmul float %a, %b
+ %sub = fsub float %mul, %c.abs
+ store float %sub, float addrspace(1)* %outgep, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}mad_sub_fabs_inv_f32:
+; SI: buffer_load_dword [[REGA:v[0-9]+]]
+; SI: buffer_load_dword [[REGB:v[0-9]+]]
+; SI: buffer_load_dword [[REGC:v[0-9]+]]
+; SI: v_mad_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|
+; SI: buffer_store_dword [[RESULT]]
+define void @mad_sub_fabs_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 {
+ %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %tid.ext = sext i32 %tid to i64
+ %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
+ %add1 = add i64 %tid.ext, 1
+ %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
+ %add2 = add i64 %tid.ext, 2
+ %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
+ %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load float, float addrspace(1)* %gep0, align 4
+ %b = load float, float addrspace(1)* %gep1, align 4
+ %c = load float, float addrspace(1)* %gep2, align 4
+ %c.abs = call float @llvm.fabs.f32(float %c) #0
+ %mul = fmul float %a, %b
+ %sub = fsub float %c.abs, %mul
+ store float %sub, float addrspace(1)* %outgep, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}neg_neg_mad_f32:
+; SI: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+define void @neg_neg_mad_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 {
+ %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %tid.ext = sext i32 %tid to i64
+ %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
+ %add1 = add i64 %tid.ext, 1
+ %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
+ %add2 = add i64 %tid.ext, 2
+ %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
+ %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load float, float addrspace(1)* %gep0, align 4
+ %b = load float, float addrspace(1)* %gep1, align 4
+ %c = load float, float addrspace(1)* %gep2, align 4
+ %nega = fsub float -0.000000e+00, %a
+ %negb = fsub float -0.000000e+00, %b
+ %mul = fmul float %nega, %negb
+ %sub = fadd float %mul, %c
+ store float %sub, float addrspace(1)* %outgep, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}mad_fabs_sub_f32:
+; SI: buffer_load_dword [[REGA:v[0-9]+]]
+; SI: buffer_load_dword [[REGB:v[0-9]+]]
+; SI: buffer_load_dword [[REGC:v[0-9]+]]
+; SI: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]]
+; SI: buffer_store_dword [[RESULT]]
+define void @mad_fabs_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 {
+ %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %tid.ext = sext i32 %tid to i64
+ %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
+ %add1 = add i64 %tid.ext, 1
+ %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
+ %add2 = add i64 %tid.ext, 2
+ %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
+ %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
+ %a = load float, float addrspace(1)* %gep0, align 4
+ %b = load float, float addrspace(1)* %gep1, align 4
+ %c = load float, float addrspace(1)* %gep2, align 4
+ %b.abs = call float @llvm.fabs.f32(float %b) #0
+ %mul = fmul float %a, %b.abs
+ %sub = fsub float %mul, %c
+ store float %sub, float addrspace(1)* %outgep, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fsub_c_fadd_a_a:
+; SI-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; SI: v_mad_f32 [[RESULT:v[0-9]+]], -2.0, [[R1]], [[R2]]
+; SI: buffer_store_dword [[RESULT]]
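+; (r2 - (r1 + r1) is r2 - 2*r1, which maps to the mad with the inline
+; constant -2.0 checked above.)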
+define void @fsub_c_fadd_a_a(float addrspace(1)* %out, float addrspace(1)* %in) {
+ %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+ %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
+
+ %r1 = load float, float addrspace(1)* %gep.0
+ %r2 = load float, float addrspace(1)* %gep.1
+
+ %add = fadd float %r1, %r1
+ %r3 = fsub float %r2, %add
+
+ store float %r3, float addrspace(1)* %gep.out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fsub_fadd_a_a_c:
+; SI-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; SI: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], -[[R2]]
+; SI: buffer_store_dword [[RESULT]]
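+; ((r1 + r1) - r2 is 2*r1 - r2, hence the mad with 2.0 and a negated r2
+; checked above.)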
+define void @fsub_fadd_a_a_c(float addrspace(1)* %out, float addrspace(1)* %in) {
+ %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+ %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
+
+ %r1 = load float, float addrspace(1)* %gep.0
+ %r2 = load float, float addrspace(1)* %gep.1
+
+ %add = fadd float %r1, %r1
+ %r3 = fsub float %add, %r2
+
+ store float %r3, float addrspace(1)* %gep.out
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/mad_int24.ll b/test/CodeGen/AMDGPU/mad_int24.ll
new file mode 100644
index 000000000000..86d75a63ca40
--- /dev/null
+++ b/test/CodeGen/AMDGPU/mad_int24.ll
@@ -0,0 +1,35 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC
+; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM --check-prefix=FUNC
+; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
+
+declare i32 @llvm.AMDGPU.imul24(i32, i32) nounwind readnone
+
+; FUNC-LABEL: {{^}}i32_mad24:
+; Signed 24-bit multiply is not supported on pre-Cayman GPUs.
+; EG: MULLO_INT
+; Make sure we aren't masking the inputs.
+; CM-NOT: AND
+; CM: MULADD_INT24
+; SI-NOT: and
+; SI: v_mad_i32_i24
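+; (For reference: the shl/ashr-by-8 pair below sign-extends the low 24 bits
+; of each input, e.g. 0x00800000 becomes 0xff800000, and v_mad_i32_i24 reads
+; only those low 24 bits, so no masking is required.)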
+define void @i32_mad24(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
+entry:
+ %0 = shl i32 %a, 8
+ %a_24 = ashr i32 %0, 8
+ %1 = shl i32 %b, 8
+ %b_24 = ashr i32 %1, 8
+ %2 = mul i32 %a_24, %b_24
+ %3 = add i32 %2, %c
+ store i32 %3, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: @test_imul24
+; SI: v_mad_i32_i24
+define void @test_imul24(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind {
+ %mul = call i32 @llvm.AMDGPU.imul24(i32 %src0, i32 %src1) nounwind readnone
+ %add = add i32 %mul, %src2
+ store i32 %add, i32 addrspace(1)* %out, align 4
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/mad_uint24.ll b/test/CodeGen/AMDGPU/mad_uint24.ll
new file mode 100644
index 000000000000..95fe34119596
--- /dev/null
+++ b/test/CodeGen/AMDGPU/mad_uint24.ll
@@ -0,0 +1,76 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC
+; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG --check-prefix=FUNC
+; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
+
+; FUNC-LABEL: {{^}}u32_mad24:
+; EG: MULADD_UINT24
+; SI: v_mad_u32_u24
+
+define void @u32_mad24(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
+entry:
+ %0 = shl i32 %a, 8
+ %a_24 = lshr i32 %0, 8
+ %1 = shl i32 %b, 8
+ %b_24 = lshr i32 %1, 8
+ %2 = mul i32 %a_24, %b_24
+ %3 = add i32 %2, %c
+ store i32 %3, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}i16_mad24:
+; The order of A and B does not matter.
+; EG: MULADD_UINT24 {{[* ]*}}T{{[0-9]}}.[[MAD_CHAN:[XYZW]]]
+; The result must be sign-extended
+; EG: BFE_INT {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[MAD_CHAN]], 0.0, literal.x
+; EG: 16
+; SI: v_mad_u32_u24 [[MAD:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
+; SI: v_bfe_i32 v{{[0-9]}}, [[MAD]], 0, 16
+
+define void @i16_mad24(i32 addrspace(1)* %out, i16 %a, i16 %b, i16 %c) {
+entry:
+ %0 = mul i16 %a, %b
+ %1 = add i16 %0, %c
+ %2 = sext i16 %1 to i32
+ store i32 %2, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}i8_mad24:
+; EG: MULADD_UINT24 {{[* ]*}}T{{[0-9]}}.[[MAD_CHAN:[XYZW]]]
+; The result must be sign-extended
+; EG: BFE_INT {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[MAD_CHAN]], 0.0, literal.x
+; EG: 8
+; SI: v_mad_u32_u24 [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
+; SI: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 8
+
+define void @i8_mad24(i32 addrspace(1)* %out, i8 %a, i8 %b, i8 %c) {
+entry:
+ %0 = mul i8 %a, %b
+ %1 = add i8 %0, %c
+ %2 = sext i8 %1 to i32
+ store i32 %2, i32 addrspace(1)* %out
+ ret void
+}
+
+; This tests for a bug where the mad_u24 pattern matcher would call
+; SimplifyDemandedBits on the first operand of the mul instruction
+; assuming that the pattern would be matched to a 24-bit mad. This
+; led to some instructions being incorrectly erased when the entire
+; 24-bit mad pattern wasn't being matched.
+
+; Check that the select instruction is not deleted.
+; FUNC-LABEL: {{^}}i24_i32_i32_mad:
+; EG: CNDE_INT
+; SI: v_cndmask
+define void @i24_i32_i32_mad(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) {
+entry:
+ %0 = ashr i32 %a, 8
+ %1 = icmp ne i32 %c, 0
+ %2 = select i1 %1, i32 %0, i32 34
+ %3 = mul i32 %2, %c
+ %4 = add i32 %3, %d
+ store i32 %4, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/madak.ll b/test/CodeGen/AMDGPU/madak.ll
new file mode 100644
index 000000000000..933bb016d2c9
--- /dev/null
+++ b/test/CodeGen/AMDGPU/madak.ll
@@ -0,0 +1,193 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
+; XUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
+
+; FIXME: Enable VI
+
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+declare float @llvm.fabs.f32(float) nounwind readnone
+
+; GCN-LABEL: {{^}}madak_f32:
+; GCN: buffer_load_dword [[VA:v[0-9]+]]
+; GCN: buffer_load_dword [[VB:v[0-9]+]]
+; GCN: v_madak_f32_e32 {{v[0-9]+}}, [[VB]], [[VA]], 0x41200000
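+; (0x41200000 is the single-precision encoding of 10.0, the addend below,
+; carried as a literal operand of v_madak_f32.)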
+define void @madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
+ %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
+ %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid
+ %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+
+ %a = load float, float addrspace(1)* %in.a.gep, align 4
+ %b = load float, float addrspace(1)* %in.b.gep, align 4
+
+ %mul = fmul float %a, %b
+ %madak = fadd float %mul, 10.0
+ store float %madak, float addrspace(1)* %out.gep, align 4
+ ret void
+}
+
+; Make sure this is only folded when there is a single use. This is a code
+; size optimization: the constant becomes a literal operand of the madak
+; instruction, so folding the same immediate into multiple instructions
+; would duplicate the literal and undo the benefit.
+
+; GCN-LABEL: {{^}}madak_2_use_f32:
+; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; GCN-DAG: buffer_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
+; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
+; GCN-DAG: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], [[VK]]
+; GCN-DAG: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VC]], [[VK]]
+; GCN: s_endpgm
+define void @madak_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
+ %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+
+ %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1
+ %in.gep.2 = getelementptr float, float addrspace(1)* %in.gep.0, i32 2
+
+ %out.gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %out.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1
+
+ %a = load float, float addrspace(1)* %in.gep.0, align 4
+ %b = load float, float addrspace(1)* %in.gep.1, align 4
+ %c = load float, float addrspace(1)* %in.gep.2, align 4
+
+ %mul0 = fmul float %a, %b
+ %mul1 = fmul float %a, %c
+ %madak0 = fadd float %mul0, 10.0
+ %madak1 = fadd float %mul1, 10.0
+
+ store float %madak0, float addrspace(1)* %out.gep.0, align 4
+ store float %madak1, float addrspace(1)* %out.gep.1, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}madak_m_inline_imm_f32:
+; GCN: buffer_load_dword [[VA:v[0-9]+]]
+; GCN: v_madak_f32_e32 {{v[0-9]+}}, 4.0, [[VA]], 0x41200000
+define void @madak_m_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a) nounwind {
+ %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
+ %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+
+ %a = load float, float addrspace(1)* %in.a.gep, align 4
+
+ %mul = fmul float 4.0, %a
+ %madak = fadd float %mul, 10.0
+ store float %madak, float addrspace(1)* %out.gep, align 4
+ ret void
+}
+
+; Make sure nothing weird happens with a value that is also allowed as
+; an inline immediate.
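+; (The addend 4.0 is an SI inline immediate, so it can be encoded directly
+; in v_mad_f32; no madak literal is needed.)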
+
+; GCN-LABEL: {{^}}madak_inline_imm_f32:
+; GCN: buffer_load_dword [[VA:v[0-9]+]]
+; GCN: buffer_load_dword [[VB:v[0-9]+]]
+; GCN: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], 4.0
+define void @madak_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
+ %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
+ %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid
+ %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+
+ %a = load float, float addrspace(1)* %in.a.gep, align 4
+ %b = load float, float addrspace(1)* %in.b.gep, align 4
+
+ %mul = fmul float %a, %b
+ %madak = fadd float %mul, 4.0
+ store float %madak, float addrspace(1)* %out.gep, align 4
+ ret void
+}
+
+; We can't use an SGPR when forming madak
+; GCN-LABEL: {{^}}s_v_madak_f32:
+; GCN: s_load_dword [[SB:s[0-9]+]]
+; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
+; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]]
+; GCN-NOT: v_madak_f32
+; GCN: v_mad_f32 {{v[0-9]+}}, [[SB]], [[VA]], [[VK]]
+define void @s_v_madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float %b) nounwind {
+ %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
+ %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+
+ %a = load float, float addrspace(1)* %in.a.gep, align 4
+
+ %mul = fmul float %a, %b
+ %madak = fadd float %mul, 10.0
+ store float %madak, float addrspace(1)* %out.gep, align 4
+ ret void
+}
+
+; GCN-LABEL: @v_s_madak_f32
+; GCN-DAG: s_load_dword [[SB:s[0-9]+]]
+; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
+; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]]
+; GCN-NOT: v_madak_f32
+; GCN: v_mad_f32 {{v[0-9]+}}, [[VA]], [[SB]], [[VK]]
+define void @v_s_madak_f32(float addrspace(1)* noalias %out, float %a, float addrspace(1)* noalias %in.b) nounwind {
+ %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid
+ %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+
+ %b = load float, float addrspace(1)* %in.b.gep, align 4
+
+ %mul = fmul float %a, %b
+ %madak = fadd float %mul, 10.0
+ store float %madak, float addrspace(1)* %out.gep, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_s_madak_f32:
+; GCN-NOT: v_madak_f32
+; GCN: v_mad_f32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+define void @s_s_madak_f32(float addrspace(1)* %out, float %a, float %b) nounwind {
+ %mul = fmul float %a, %b
+ %madak = fadd float %mul, 10.0
+ store float %madak, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}no_madak_src0_modifier_f32:
+; GCN: buffer_load_dword [[VA:v[0-9]+]]
+; GCN: buffer_load_dword [[VB:v[0-9]+]]
+; GCN: v_mad_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, {{[sv][0-9]+}}
+; GCN: s_endpgm
+define void @no_madak_src0_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
+ %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
+ %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid
+ %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+
+ %a = load float, float addrspace(1)* %in.a.gep, align 4
+ %b = load float, float addrspace(1)* %in.b.gep, align 4
+
+ %a.fabs = call float @llvm.fabs.f32(float %a) nounwind readnone
+
+ %mul = fmul float %a.fabs, %b
+ %madak = fadd float %mul, 10.0
+ store float %madak, float addrspace(1)* %out.gep, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}no_madak_src1_modifier_f32:
+; GCN: buffer_load_dword [[VA:v[0-9]+]]
+; GCN: buffer_load_dword [[VB:v[0-9]+]]
+; GCN: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|, {{[sv][0-9]+}}
+; GCN: s_endpgm
+define void @no_madak_src1_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
+ %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
+ %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid
+ %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+
+ %a = load float, float addrspace(1)* %in.a.gep, align 4
+ %b = load float, float addrspace(1)* %in.b.gep, align 4
+
+ %b.fabs = call float @llvm.fabs.f32(float %b) nounwind readnone
+
+ %mul = fmul float %a, %b.fabs
+ %madak = fadd float %mul, 10.0
+ store float %madak, float addrspace(1)* %out.gep, align 4
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/madmk.ll b/test/CodeGen/AMDGPU/madmk.ll
new file mode 100644
index 000000000000..ba7bb221a99a
--- /dev/null
+++ b/test/CodeGen/AMDGPU/madmk.ll
@@ -0,0 +1,205 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; XUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+declare float @llvm.fabs.f32(float) nounwind readnone
+
+; GCN-LABEL: {{^}}madmk_f32:
+; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; GCN: v_madmk_f32_e32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
+define void @madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
+ %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+ %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+
+ %a = load float, float addrspace(1)* %gep.0, align 4
+ %b = load float, float addrspace(1)* %gep.1, align 4
+
+ %mul = fmul float %a, 10.0
+ %madmk = fadd float %mul, %b
+ store float %madmk, float addrspace(1)* %out.gep, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}madmk_2_use_f32:
+; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; GCN-DAG: buffer_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
+; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
+; GCN-DAG: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VK]], [[VB]]
+; GCN-DAG: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VK]], [[VC]]
+; GCN: s_endpgm
+define void @madmk_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
+ %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+
+ %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1
+ %in.gep.2 = getelementptr float, float addrspace(1)* %in.gep.0, i32 2
+
+ %out.gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %out.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1
+
+ %a = load float, float addrspace(1)* %in.gep.0, align 4
+ %b = load float, float addrspace(1)* %in.gep.1, align 4
+ %c = load float, float addrspace(1)* %in.gep.2, align 4
+
+ %mul0 = fmul float %a, 10.0
+ %mul1 = fmul float %a, 10.0
+ %madmk0 = fadd float %mul0, %b
+ %madmk1 = fadd float %mul1, %c
+
+ store float %madmk0, float addrspace(1)* %out.gep.0, align 4
+ store float %madmk1, float addrspace(1)* %out.gep.1, align 4
+ ret void
+}
+
+; We don't get any benefit if the constant is an inline immediate.
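+; (4.0 is an inline immediate, so v_mad_f32 encodes it for free and the
+; v_madmk_f32 literal form would not shrink the code.)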
+; GCN-LABEL: {{^}}madmk_inline_imm_f32:
+; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; GCN: v_mad_f32 {{v[0-9]+}}, 4.0, [[VA]], [[VB]]
+define void @madmk_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
+ %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+ %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+
+ %a = load float, float addrspace(1)* %gep.0, align 4
+ %b = load float, float addrspace(1)* %gep.1, align 4
+
+ %mul = fmul float %a, 4.0
+ %madmk = fadd float %mul, %b
+ store float %madmk, float addrspace(1)* %out.gep, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_s_madmk_f32:
+; GCN-NOT: v_madmk_f32
+; GCN: v_mad_f32
+; GCN: s_endpgm
+define void @s_s_madmk_f32(float addrspace(1)* noalias %out, float %a, float %b) nounwind {
+ %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+
+ %mul = fmul float %a, 10.0
+ %madmk = fadd float %mul, %b
+ store float %madmk, float addrspace(1)* %out.gep, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_s_madmk_f32:
+; GCN-NOT: v_madmk_f32
+; GCN: v_mad_f32
+; GCN: s_endpgm
+define void @v_s_madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in, float %b) nounwind {
+ %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load float, float addrspace(1)* %gep.0, align 4
+
+ %mul = fmul float %a, 10.0
+ %madmk = fadd float %mul, %b
+ store float %madmk, float addrspace(1)* %out.gep, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}scalar_vector_madmk_f32:
+; GCN-NOT: v_madmk_f32
+; GCN: v_mad_f32
+; GCN: s_endpgm
+define void @scalar_vector_madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in, float %a) nounwind {
+ %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %b = load float, float addrspace(1)* %gep.0, align 4
+
+ %mul = fmul float %a, 10.0
+ %madmk = fadd float %mul, %b
+ store float %madmk, float addrspace(1)* %out.gep, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}no_madmk_src0_modifier_f32:
+; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; GCN: v_mad_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, {{[sv][0-9]+}}
+define void @no_madmk_src0_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
+ %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+ %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+
+ %a = load float, float addrspace(1)* %gep.0, align 4
+ %b = load float, float addrspace(1)* %gep.1, align 4
+
+ %a.fabs = call float @llvm.fabs.f32(float %a) nounwind readnone
+
+ %mul = fmul float %a.fabs, 10.0
+ %madmk = fadd float %mul, %b
+ store float %madmk, float addrspace(1)* %out.gep, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}no_madmk_src2_modifier_f32:
+; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; GCN: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, |{{[sv][0-9]+}}|
+define void @no_madmk_src2_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
+ %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+ %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+
+ %a = load float, float addrspace(1)* %gep.0, align 4
+ %b = load float, float addrspace(1)* %gep.1, align 4
+
+ %b.fabs = call float @llvm.fabs.f32(float %b) nounwind readnone
+
+ %mul = fmul float %a, 10.0
+ %madmk = fadd float %mul, %b.fabs
+ store float %madmk, float addrspace(1)* %out.gep, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}madmk_add_inline_imm_f32:
+; GCN: buffer_load_dword [[A:v[0-9]+]]
+; GCN: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
+; GCN: v_mad_f32 {{v[0-9]+}}, [[VK]], [[A]], 2.0
+define void @madmk_add_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
+ %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+
+ %a = load float, float addrspace(1)* %gep.0, align 4
+
+ %mul = fmul float %a, 10.0
+ %madmk = fadd float %mul, 2.0
+ store float %madmk, float addrspace(1)* %out.gep, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}kill_madmk_verifier_error:
+; SI: s_xor_b64
+; SI: v_madmk_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, 0x472aee8c
+; SI: s_or_b64
+define void @kill_madmk_verifier_error() nounwind {
+bb:
+ br label %bb2
+
+bb1: ; preds = %bb2
+ ret void
+
+bb2: ; preds = %bb6, %bb
+ %tmp = phi float [ undef, %bb ], [ %tmp8, %bb6 ]
+ %tmp3 = fsub float undef, %tmp
+ %tmp5 = fcmp oeq float %tmp3, 1.000000e+04
+ br i1 %tmp5, label %bb1, label %bb6
+
+bb6: ; preds = %bb2
+ %tmp4 = fmul float %tmp, undef
+ %tmp7 = fmul float %tmp4, 0x40E55DD180000000
+ %tmp8 = fadd float %tmp7, undef
+ br label %bb2
+}
diff --git a/test/CodeGen/AMDGPU/max-literals.ll b/test/CodeGen/AMDGPU/max-literals.ll
new file mode 100644
index 000000000000..c357524b140f
--- /dev/null
+++ b/test/CodeGen/AMDGPU/max-literals.ll
@@ -0,0 +1,67 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+; CHECK-LABEL: {{^}}main:
+; CHECK: ADD *
+
+define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2) #0 {
+main_body:
+ %0 = extractelement <4 x float> %reg1, i32 0
+ %1 = extractelement <4 x float> %reg1, i32 1
+ %2 = extractelement <4 x float> %reg1, i32 2
+ %3 = extractelement <4 x float> %reg1, i32 3
+ %4 = extractelement <4 x float> %reg2, i32 0
+ %5 = fadd float %0, 2.0
+ %6 = fadd float %1, 3.0
+ %7 = fadd float %2, 4.0
+ %8 = fadd float %3, 5.0
+ %9 = bitcast float %4 to i32
+ %10 = mul i32 %9, 6
+ %11 = bitcast i32 %10 to float
+ %12 = insertelement <4 x float> undef, float %5, i32 0
+ %13 = insertelement <4 x float> %12, float %6, i32 1
+ %14 = insertelement <4 x float> %13, float %7, i32 2
+ %15 = insertelement <4 x float> %14, float %8, i32 3
+ %16 = insertelement <4 x float> %15, float %11, i32 3
+
+ %17 = call float @llvm.AMDGPU.dp4(<4 x float> %15,<4 x float> %16)
+ %18 = insertelement <4 x float> undef, float %17, i32 0
+ call void @llvm.R600.store.swizzle(<4 x float> %18, i32 0, i32 2)
+ ret void
+}
+
+; CHECK-LABEL: {{^}}main2:
+; CHECK-NOT: ADD *
+
+define void @main2(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2) #0 {
+main_body:
+ %0 = extractelement <4 x float> %reg1, i32 0
+ %1 = extractelement <4 x float> %reg1, i32 1
+ %2 = extractelement <4 x float> %reg1, i32 2
+ %3 = extractelement <4 x float> %reg1, i32 3
+ %4 = extractelement <4 x float> %reg2, i32 0
+ %5 = fadd float %0, 2.0
+ %6 = fadd float %1, 3.0
+ %7 = fadd float %2, 4.0
+ %8 = fadd float %3, 2.0
+ %9 = bitcast float %4 to i32
+ %10 = mul i32 %9, 6
+ %11 = bitcast i32 %10 to float
+ %12 = insertelement <4 x float> undef, float %5, i32 0
+ %13 = insertelement <4 x float> %12, float %6, i32 1
+ %14 = insertelement <4 x float> %13, float %7, i32 2
+ %15 = insertelement <4 x float> %14, float %8, i32 3
+ %16 = insertelement <4 x float> %15, float %11, i32 3
+
+ %17 = call float @llvm.AMDGPU.dp4(<4 x float> %15,<4 x float> %16)
+ %18 = insertelement <4 x float> undef, float %17, i32 0
+ call void @llvm.R600.store.swizzle(<4 x float> %18, i32 0, i32 2)
+ ret void
+}
+
+; Function Attrs: readnone
+declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1
+
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+
+attributes #0 = { "ShaderType"="1" }
+attributes #1 = { readnone }
diff --git a/test/CodeGen/AMDGPU/max.ll b/test/CodeGen/AMDGPU/max.ll
new file mode 100644
index 000000000000..fef3e2f0a21c
--- /dev/null
+++ b/test/CodeGen/AMDGPU/max.ll
@@ -0,0 +1,168 @@
+; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+
+; FUNC-LABEL: @v_test_imax_sge_i32
+; SI: v_max_i32_e32
+define void @v_test_imax_sge_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
+ %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
+ %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %a = load i32, i32 addrspace(1)* %gep0, align 4
+ %b = load i32, i32 addrspace(1)* %gep1, align 4
+ %cmp = icmp sge i32 %a, %b
+ %val = select i1 %cmp, i32 %a, i32 %b
+ store i32 %val, i32 addrspace(1)* %outgep, align 4
+ ret void
+}
+
+; FUNC-LABEL: @s_test_imax_sge_i32
+; SI: s_max_i32
+define void @s_test_imax_sge_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+ %cmp = icmp sge i32 %a, %b
+ %val = select i1 %cmp, i32 %a, i32 %b
+ store i32 %val, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_test_imax_sge_imm_i32:
+; SI: s_max_i32 {{s[0-9]+}}, {{s[0-9]+}}, 9
+define void @s_test_imax_sge_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind {
+ %cmp = icmp sge i32 %a, 9
+ %val = select i1 %cmp, i32 %a, i32 9
+ store i32 %val, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_test_imax_sgt_imm_i32:
+; SI: s_max_i32 {{s[0-9]+}}, {{s[0-9]+}}, 9
+define void @s_test_imax_sgt_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind {
+ %cmp = icmp sgt i32 %a, 9
+ %val = select i1 %cmp, i32 %a, i32 9
+ store i32 %val, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @v_test_imax_sgt_i32
+; SI: v_max_i32_e32
+define void @v_test_imax_sgt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
+ %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
+ %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %a = load i32, i32 addrspace(1)* %gep0, align 4
+ %b = load i32, i32 addrspace(1)* %gep1, align 4
+ %cmp = icmp sgt i32 %a, %b
+ %val = select i1 %cmp, i32 %a, i32 %b
+ store i32 %val, i32 addrspace(1)* %outgep, align 4
+ ret void
+}
+
+; FUNC-LABEL: @s_test_imax_sgt_i32
+; SI: s_max_i32
+define void @s_test_imax_sgt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+ %cmp = icmp sgt i32 %a, %b
+ %val = select i1 %cmp, i32 %a, i32 %b
+ store i32 %val, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @v_test_umax_uge_i32
+; SI: v_max_u32_e32
+define void @v_test_umax_uge_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
+ %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
+ %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %a = load i32, i32 addrspace(1)* %gep0, align 4
+ %b = load i32, i32 addrspace(1)* %gep1, align 4
+ %cmp = icmp uge i32 %a, %b
+ %val = select i1 %cmp, i32 %a, i32 %b
+ store i32 %val, i32 addrspace(1)* %outgep, align 4
+ ret void
+}
+
+; FUNC-LABEL: @s_test_umax_uge_i32
+; SI: s_max_u32
+define void @s_test_umax_uge_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+ %cmp = icmp uge i32 %a, %b
+ %val = select i1 %cmp, i32 %a, i32 %b
+ store i32 %val, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @v_test_umax_ugt_i32
+; SI: v_max_u32_e32
+define void @v_test_umax_ugt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
+ %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
+ %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %a = load i32, i32 addrspace(1)* %gep0, align 4
+ %b = load i32, i32 addrspace(1)* %gep1, align 4
+ %cmp = icmp ugt i32 %a, %b
+ %val = select i1 %cmp, i32 %a, i32 %b
+ store i32 %val, i32 addrspace(1)* %outgep, align 4
+ ret void
+}
+
+; FUNC-LABEL: @s_test_umax_ugt_i32
+; SI: s_max_u32
+define void @s_test_umax_ugt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+ %cmp = icmp ugt i32 %a, %b
+ %val = select i1 %cmp, i32 %a, i32 %b
+ store i32 %val, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; Make sure the redundant 'and' is removed.
+; FUNC-LABEL: {{^}}simplify_demanded_bits_test_umax_ugt_i16:
+; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc
+; SI: s_max_u32 [[MIN:s[0-9]+]], [[A]], [[B]]
+; SI-NEXT: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]]
+; SI-NEXT: buffer_store_dword [[VMIN]]
+define void @simplify_demanded_bits_test_umax_ugt_i16(i32 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) nounwind {
+ %a.ext = zext i16 %a to i32
+ %b.ext = zext i16 %b to i32
+ %cmp = icmp ugt i32 %a.ext, %b.ext
+ %val = select i1 %cmp, i32 %a.ext, i32 %b.ext
+ %mask = and i32 %val, 65535
+ store i32 %mask, i32 addrspace(1)* %out
+ ret void
+}
+
+; Make sure the redundant sign_extend_inreg is removed.
+
+; FUNC-LABEL: {{^}}simplify_demanded_bits_test_min_slt_i16:
+; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc
+; SI: s_max_i32 [[MIN:s[0-9]+]], [[A]], [[B]]
+; SI-NEXT: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]]
+; SI-NEXT: buffer_store_dword [[VMIN]]
+define void @simplify_demanded_bits_test_min_slt_i16(i32 addrspace(1)* %out, i16 signext %a, i16 signext %b) nounwind {
+ %a.ext = sext i16 %a to i32
+ %b.ext = sext i16 %b to i32
+ %cmp = icmp sgt i32 %a.ext, %b.ext
+ %val = select i1 %cmp, i32 %a.ext, i32 %b.ext
+ %shl = shl i32 %val, 16
+ %sextinreg = ashr i32 %shl, 16
+ store i32 %sextinreg, i32 addrspace(1)* %out
+ ret void
+}
+
+; FIXME: Should match min/max through the extends inserted by
+; legalization.
+
+; FUNC-LABEL: {{^}}s_test_imin_sge_i16:
+; SI: s_sext_i32_i16
+; SI: s_sext_i32_i16
+; SI: v_cmp_ge_i32_e32
+; SI: v_cndmask_b32
+define void @s_test_imin_sge_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) nounwind {
+ %cmp = icmp sge i16 %a, %b
+ %val = select i1 %cmp, i16 %a, i16 %b
+ store i16 %val, i16 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/max3.ll b/test/CodeGen/AMDGPU/max3.ll
new file mode 100644
index 000000000000..cfb94b272e51
--- /dev/null
+++ b/test/CodeGen/AMDGPU/max3.ll
@@ -0,0 +1,41 @@
+; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+
+; FUNC-LABEL: @v_test_imax3_sgt_i32
+; SI: v_max3_i32
+define void @v_test_imax3_sgt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
+ %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
+ %gep2 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid
+ %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %a = load i32, i32 addrspace(1)* %gep0, align 4
+ %b = load i32, i32 addrspace(1)* %gep1, align 4
+ %c = load i32, i32 addrspace(1)* %gep2, align 4
+ %icmp0 = icmp sgt i32 %a, %b
+ %i0 = select i1 %icmp0, i32 %a, i32 %b
+ %icmp1 = icmp sgt i32 %i0, %c
+ %i1 = select i1 %icmp1, i32 %i0, i32 %c
+ store i32 %i1, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @v_test_umax3_ugt_i32
+; SI: v_max3_u32
+define void @v_test_umax3_ugt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
+ %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
+ %gep2 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid
+ %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %a = load i32, i32 addrspace(1)* %gep0, align 4
+ %b = load i32, i32 addrspace(1)* %gep1, align 4
+ %c = load i32, i32 addrspace(1)* %gep2, align 4
+ %icmp0 = icmp ugt i32 %a, %b
+ %i0 = select i1 %icmp0, i32 %a, i32 %b
+ %icmp1 = icmp ugt i32 %i0, %c
+ %i1 = select i1 %icmp1, i32 %i0, i32 %c
+ store i32 %i1, i32 addrspace(1)* %out, align 4
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/merge-stores.ll b/test/CodeGen/AMDGPU/merge-stores.ll
new file mode 100644
index 000000000000..34a2fc7ffa74
--- /dev/null
+++ b/test/CodeGen/AMDGPU/merge-stores.ll
@@ -0,0 +1,633 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
+
+; Run on devices with different unaligned load restrictions.
+
+; TODO: Vector element tests
+; TODO: Non-zero base offset for load and store combinations
+; TODO: Same base addrspacecasted
+
+
+; GCN-LABEL: {{^}}merge_global_store_2_constants_i8:
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+; GCN: s_endpgm
+define void @merge_global_store_2_constants_i8(i8 addrspace(1)* %out) #0 {
+ %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
+
+ store i8 123, i8 addrspace(1)* %out.gep.1
+ store i8 456, i8 addrspace(1)* %out, align 2
+ ret void
+}
+
+; GCN-LABEL: {{^}}merge_global_store_2_constants_i8_natural_align:
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+; GCN: s_endpgm
+define void @merge_global_store_2_constants_i8_natural_align(i8 addrspace(1)* %out) #0 {
+ %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
+
+ store i8 123, i8 addrspace(1)* %out.gep.1
+ store i8 456, i8 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}merge_global_store_2_constants_i16:
+; GCN: buffer_store_dword v
+define void @merge_global_store_2_constants_i16(i16 addrspace(1)* %out) #0 {
+ %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
+
+ store i16 123, i16 addrspace(1)* %out.gep.1
+ store i16 456, i16 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}merge_global_store_2_constants_0_i16:
+; GCN: buffer_store_dword v
+define void @merge_global_store_2_constants_0_i16(i16 addrspace(1)* %out) #0 {
+ %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
+
+ store i16 0, i16 addrspace(1)* %out.gep.1
+ store i16 0, i16 addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}merge_global_store_2_constants_i16_natural_align:
+; GCN: buffer_store_short
+; GCN: buffer_store_short
+; GCN: s_endpgm
+define void @merge_global_store_2_constants_i16_natural_align(i16 addrspace(1)* %out) #0 {
+ %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
+
+ store i16 123, i16 addrspace(1)* %out.gep.1
+ store i16 456, i16 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}merge_global_store_2_constants_i32:
+; SI-DAG: s_movk_i32 [[SLO:s[0-9]+]], 0x1c8
+; SI-DAG: s_movk_i32 [[SHI:s[0-9]+]], 0x7b
+; SI-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[SLO]]
+; SI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHI]]
+; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
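+; (0x1c8 is 456 and 0x7b is 123; the value stored at the lower address, 456,
+; lands in the low register of the dwordx2 pair.)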
+define void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 {
+ %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
+
+ store i32 123, i32 addrspace(1)* %out.gep.1
+ store i32 456, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}merge_global_store_2_constants_i32_f32:
+; GCN: buffer_store_dwordx2
+define void @merge_global_store_2_constants_i32_f32(i32 addrspace(1)* %out) #0 {
+ %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
+ %out.gep.1.bc = bitcast i32 addrspace(1)* %out.gep.1 to float addrspace(1)*
+ store float 1.0, float addrspace(1)* %out.gep.1.bc
+ store i32 456, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}merge_global_store_2_constants_f32_i32:
+; SI-DAG: s_mov_b32 [[SLO:s[0-9]+]], 4.0
+; SI-DAG: s_movk_i32 [[SHI:s[0-9]+]], 0x7b{{$}}
+; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], [[SLO]]
+; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], [[SHI]]
+; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
+define void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 {
+ %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
+ %out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)*
+ store i32 123, i32 addrspace(1)* %out.gep.1.bc
+ store float 4.0, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}merge_global_store_4_constants_i32:
+; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x14d{{$}}
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x1c8{{$}}
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x7b{{$}}
+; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x4d2{{$}}
+; GCN: buffer_store_dwordx4 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @merge_global_store_4_constants_i32(i32 addrspace(1)* %out) #0 {
+ %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
+ %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
+ %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
+
+ store i32 123, i32 addrspace(1)* %out.gep.1
+ store i32 456, i32 addrspace(1)* %out.gep.2
+ store i32 333, i32 addrspace(1)* %out.gep.3
+ store i32 1234, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}merge_global_store_4_constants_f32_order:
+; XGCN: buffer_store_dwordx4
+; GCN: buffer_store_dword v
+; GCN: buffer_store_dword v
+; GCN: buffer_store_dwordx2 v
+define void @merge_global_store_4_constants_f32_order(float addrspace(1)* %out) #0 {
+ %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
+ %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
+ %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
+
+ store float 8.0, float addrspace(1)* %out
+ store float 1.0, float addrspace(1)* %out.gep.1
+ store float 2.0, float addrspace(1)* %out.gep.2
+ store float 4.0, float addrspace(1)* %out.gep.3
+ ret void
+}
+
+; The first store is out of order. Because of the order of the combines, the
+; consecutive-store merge fails: only some of the stores have been replaced
+; with integer constant stores, and those then won't merge with the rest
+; because the types are different.
+
+; GCN-LABEL: {{^}}merge_global_store_4_constants_f32:
+; XGCN: buffer_store_dwordx4
+; GCN: buffer_store_dword v
+; GCN: buffer_store_dword v
+; GCN: buffer_store_dword v
+; GCN: buffer_store_dword v
+define void @merge_global_store_4_constants_f32(float addrspace(1)* %out) #0 {
+ %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
+ %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
+ %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
+
+ store float 1.0, float addrspace(1)* %out.gep.1
+ store float 2.0, float addrspace(1)* %out.gep.2
+ store float 4.0, float addrspace(1)* %out.gep.3
+ store float 8.0, float addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}merge_global_store_3_constants_i32:
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dword
+; SI-NOT: buffer_store_dword
+; GCN: s_endpgm
+define void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 {
+ %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
+ %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
+
+ store i32 123, i32 addrspace(1)* %out.gep.1
+ store i32 456, i32 addrspace(1)* %out.gep.2
+ store i32 1234, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}merge_global_store_2_constants_i64:
+; XGCN: buffer_store_dwordx4
+; GCN: buffer_store_dwordx2
+; GCN: buffer_store_dwordx2
+define void @merge_global_store_2_constants_i64(i64 addrspace(1)* %out) #0 {
+ %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1
+
+ store i64 123, i64 addrspace(1)* %out.gep.1
+ store i64 456, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}merge_global_store_4_constants_i64:
+; XGCN: buffer_store_dwordx4
+; XGCN: buffer_store_dwordx4
+
+; GCN: buffer_store_dwordx2
+; GCN: buffer_store_dwordx2
+; GCN: buffer_store_dwordx2
+; GCN: buffer_store_dwordx2
+define void @merge_global_store_4_constants_i64(i64 addrspace(1)* %out) #0 {
+ %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1
+ %out.gep.2 = getelementptr i64, i64 addrspace(1)* %out, i64 2
+ %out.gep.3 = getelementptr i64, i64 addrspace(1)* %out, i64 3
+
+ store i64 123, i64 addrspace(1)* %out.gep.1
+ store i64 456, i64 addrspace(1)* %out.gep.2
+ store i64 333, i64 addrspace(1)* %out.gep.3
+ store i64 1234, i64 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32:
+; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]]
+; GCN: buffer_store_dwordx2 [[LOAD]]
+define void @merge_global_store_2_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
+ %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
+
+ %lo = load i32, i32 addrspace(1)* %in
+ %hi = load i32, i32 addrspace(1)* %in.gep.1
+
+ store i32 %lo, i32 addrspace(1)* %out
+ store i32 %hi, i32 addrspace(1)* %out.gep.1
+ ret void
+}
+
+; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32_nonzero_base:
+; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
+; GCN: buffer_store_dwordx2 [[LOAD]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
+define void @merge_global_store_2_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 2
+ %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 3
+
+ %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i32 2
+ %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 3
+ %lo = load i32, i32 addrspace(1)* %in.gep.0
+ %hi = load i32, i32 addrspace(1)* %in.gep.1
+
+ store i32 %lo, i32 addrspace(1)* %out.gep.0
+ store i32 %hi, i32 addrspace(1)* %out.gep.1
+ ret void
+}
+
+; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_shuffle_i32:
+; GCN: buffer_load_dword v
+; GCN: buffer_load_dword v
+; GCN: buffer_store_dword v
+; GCN: buffer_store_dword v
+define void @merge_global_store_2_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
+ %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
+
+ %lo = load i32, i32 addrspace(1)* %in
+ %hi = load i32, i32 addrspace(1)* %in.gep.1
+
+ store i32 %hi, i32 addrspace(1)* %out
+ store i32 %lo, i32 addrspace(1)* %out.gep.1
+ ret void
+}
+
+; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32:
+; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
+; GCN: buffer_store_dwordx4 [[LOAD]]
+define void @merge_global_store_4_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
+ %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
+ %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
+ %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
+ %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
+ %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3
+
+ %x = load i32, i32 addrspace(1)* %in
+ %y = load i32, i32 addrspace(1)* %in.gep.1
+ %z = load i32, i32 addrspace(1)* %in.gep.2
+ %w = load i32, i32 addrspace(1)* %in.gep.3
+
+ store i32 %x, i32 addrspace(1)* %out
+ store i32 %y, i32 addrspace(1)* %out.gep.1
+ store i32 %z, i32 addrspace(1)* %out.gep.2
+ store i32 %w, i32 addrspace(1)* %out.gep.3
+ ret void
+}
+
+; GCN-LABEL: {{^}}merge_global_store_3_adjacent_loads_i32:
+; SI-DAG: buffer_load_dwordx2
+; SI-DAG: buffer_load_dword v
+; GCN: s_waitcnt
+; SI-DAG: buffer_store_dword v
+; SI-DAG: buffer_store_dwordx2 v
+; GCN: s_endpgm
+define void @merge_global_store_3_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
+ %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
+ %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
+ %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
+
+ %x = load i32, i32 addrspace(1)* %in
+ %y = load i32, i32 addrspace(1)* %in.gep.1
+ %z = load i32, i32 addrspace(1)* %in.gep.2
+
+ store i32 %x, i32 addrspace(1)* %out
+ store i32 %y, i32 addrspace(1)* %out.gep.1
+ store i32 %z, i32 addrspace(1)* %out.gep.2
+ ret void
+}
+
+; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_f32:
+; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
+; GCN: buffer_store_dwordx4 [[LOAD]]
+define void @merge_global_store_4_adjacent_loads_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+ %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
+ %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
+ %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
+ %in.gep.1 = getelementptr float, float addrspace(1)* %in, i32 1
+ %in.gep.2 = getelementptr float, float addrspace(1)* %in, i32 2
+ %in.gep.3 = getelementptr float, float addrspace(1)* %in, i32 3
+
+ %x = load float, float addrspace(1)* %in
+ %y = load float, float addrspace(1)* %in.gep.1
+ %z = load float, float addrspace(1)* %in.gep.2
+ %w = load float, float addrspace(1)* %in.gep.3
+
+ store float %x, float addrspace(1)* %out
+ store float %y, float addrspace(1)* %out.gep.1
+ store float %z, float addrspace(1)* %out.gep.2
+ store float %w, float addrspace(1)* %out.gep.3
+ ret void
+}
+
+; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32_nonzero_base:
+; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44
+; GCN: buffer_store_dwordx4 [[LOAD]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:28
+define void @merge_global_store_4_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 11
+ %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 12
+ %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 13
+ %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 14
+ %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i32 7
+ %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 8
+ %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 9
+ %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 10
+
+ %x = load i32, i32 addrspace(1)* %in.gep.0
+ %y = load i32, i32 addrspace(1)* %in.gep.1
+ %z = load i32, i32 addrspace(1)* %in.gep.2
+ %w = load i32, i32 addrspace(1)* %in.gep.3
+
+ store i32 %x, i32 addrspace(1)* %out.gep.0
+ store i32 %y, i32 addrspace(1)* %out.gep.1
+ store i32 %z, i32 addrspace(1)* %out.gep.2
+ store i32 %w, i32 addrspace(1)* %out.gep.3
+ ret void
+}
+
+; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_inverse_i32:
+; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
+; GCN: s_barrier
+; GCN: buffer_store_dwordx4 [[LOAD]]
+define void @merge_global_store_4_adjacent_loads_inverse_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
+ %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
+ %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
+ %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
+ %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
+ %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3
+
+ %x = load i32, i32 addrspace(1)* %in
+ %y = load i32, i32 addrspace(1)* %in.gep.1
+ %z = load i32, i32 addrspace(1)* %in.gep.2
+ %w = load i32, i32 addrspace(1)* %in.gep.3
+
+ ; Make sure the barrier doesn't stop this
+ tail call void @llvm.AMDGPU.barrier.local() #1
+
+ store i32 %w, i32 addrspace(1)* %out.gep.3
+ store i32 %z, i32 addrspace(1)* %out.gep.2
+ store i32 %y, i32 addrspace(1)* %out.gep.1
+ store i32 %x, i32 addrspace(1)* %out
+
+ ret void
+}
+
+; TODO: Re-packing of the loaded registers is required; maybe an IR pass
+; should catch this? See the illustrative sketch after the test below.
+
+; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_shuffle_i32:
+; GCN: buffer_load_dword v
+; GCN: buffer_load_dword v
+; GCN: buffer_load_dword v
+; GCN: buffer_load_dword v
+; GCN: s_barrier
+; GCN: buffer_store_dword v
+; GCN: buffer_store_dword v
+; GCN: buffer_store_dword v
+; GCN: buffer_store_dword v
+define void @merge_global_store_4_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
+ %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
+ %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
+ %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
+ %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
+ %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3
+
+ %x = load i32, i32 addrspace(1)* %in
+ %y = load i32, i32 addrspace(1)* %in.gep.1
+ %z = load i32, i32 addrspace(1)* %in.gep.2
+ %w = load i32, i32 addrspace(1)* %in.gep.3
+
+ ; Make sure the barrier doesn't stop this
+ tail call void @llvm.AMDGPU.barrier.local() #1
+
+ store i32 %w, i32 addrspace(1)* %out
+ store i32 %z, i32 addrspace(1)* %out.gep.1
+ store i32 %y, i32 addrspace(1)* %out.gep.2
+ store i32 %x, i32 addrspace(1)* %out.gep.3
+
+ ret void
+}
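+
+; An illustrative repacked form of the shuffle case above (hypothetical, not
+; part of this test): load and store the elements as a single vector and express
+; the reordering with a shufflevector. %in.vec and %out.vec stand for the
+; pointers bitcast to <4 x i32> addrspace(1)*.
+;   %vec = load <4 x i32>, <4 x i32> addrspace(1)* %in.vec, align 4
+;   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+;   store <4 x i32> %shuf, <4 x i32> addrspace(1)* %out.vec, align 4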
+
+; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i8:
+; GCN: buffer_load_dword [[LOAD:v[0-9]+]]
+; GCN: buffer_store_dword [[LOAD]]
+; GCN: s_endpgm
+define void @merge_global_store_4_adjacent_loads_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
+ %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1
+ %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2
+ %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3
+ %in.gep.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1
+ %in.gep.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2
+ %in.gep.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3
+
+ %x = load i8, i8 addrspace(1)* %in, align 4
+ %y = load i8, i8 addrspace(1)* %in.gep.1
+ %z = load i8, i8 addrspace(1)* %in.gep.2
+ %w = load i8, i8 addrspace(1)* %in.gep.3
+
+ store i8 %x, i8 addrspace(1)* %out, align 4
+ store i8 %y, i8 addrspace(1)* %out.gep.1
+ store i8 %z, i8 addrspace(1)* %out.gep.2
+ store i8 %w, i8 addrspace(1)* %out.gep.3
+ ret void
+}
+
+; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i8_natural_align:
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+; GCN: s_endpgm
+define void @merge_global_store_4_adjacent_loads_i8_natural_align(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
+ %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1
+ %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2
+ %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3
+ %in.gep.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1
+ %in.gep.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2
+ %in.gep.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3
+
+ %x = load i8, i8 addrspace(1)* %in
+ %y = load i8, i8 addrspace(1)* %in.gep.1
+ %z = load i8, i8 addrspace(1)* %in.gep.2
+ %w = load i8, i8 addrspace(1)* %in.gep.3
+
+ store i8 %x, i8 addrspace(1)* %out
+ store i8 %y, i8 addrspace(1)* %out.gep.1
+ store i8 %z, i8 addrspace(1)* %out.gep.2
+ store i8 %w, i8 addrspace(1)* %out.gep.3
+ ret void
+}
+
+; This works once AA is enabled on the subtarget
+; GCN-LABEL: {{^}}merge_global_store_4_vector_elts_loads_v4i32:
+; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
+; XGCN: buffer_store_dwordx4 [[LOAD]]
+; GCN: buffer_store_dword v
+; GCN: buffer_store_dword v
+; GCN: buffer_store_dword v
+; GCN: buffer_store_dword v
+define void @merge_global_store_4_vector_elts_loads_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
+ %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
+ %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
+ %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
+ %vec = load <4 x i32>, <4 x i32> addrspace(1)* %in
+
+ %x = extractelement <4 x i32> %vec, i32 0
+ %y = extractelement <4 x i32> %vec, i32 1
+ %z = extractelement <4 x i32> %vec, i32 2
+ %w = extractelement <4 x i32> %vec, i32 3
+
+ store i32 %x, i32 addrspace(1)* %out
+ store i32 %y, i32 addrspace(1)* %out.gep.1
+ store i32 %z, i32 addrspace(1)* %out.gep.2
+ store i32 %w, i32 addrspace(1)* %out.gep.3
+ ret void
+}
+
+; GCN-LABEL: {{^}}merge_local_store_2_constants_i8:
+; GCN: ds_write_b8
+; GCN: ds_write_b8
+; GCN: s_endpgm
+define void @merge_local_store_2_constants_i8(i8 addrspace(3)* %out) #0 {
+ %out.gep.1 = getelementptr i8, i8 addrspace(3)* %out, i32 1
+
+ store i8 123, i8 addrspace(3)* %out.gep.1
+ store i8 456, i8 addrspace(3)* %out, align 2
+ ret void
+}
+
+; GCN-LABEL: {{^}}merge_local_store_2_constants_i32:
+; GCN-DAG: s_movk_i32 [[SLO:s[0-9]+]], 0x1c8
+; GCN-DAG: s_movk_i32 [[SHI:s[0-9]+]], 0x7b
+; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[SLO]]
+; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHI]]
+; GCN: ds_write2_b32 v{{[0-9]+}}, v[[LO]], v[[HI]] offset1:1{{$}}
+define void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 {
+ %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1
+
+ store i32 123, i32 addrspace(3)* %out.gep.1
+ store i32 456, i32 addrspace(3)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}merge_local_store_4_constants_i32:
+; GCN: ds_write_b32
+; GCN: ds_write_b32
+; GCN: ds_write_b32
+; GCN: ds_write_b32
+define void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 {
+ %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1
+ %out.gep.2 = getelementptr i32, i32 addrspace(3)* %out, i32 2
+ %out.gep.3 = getelementptr i32, i32 addrspace(3)* %out, i32 3
+
+ store i32 123, i32 addrspace(3)* %out.gep.1
+ store i32 456, i32 addrspace(3)* %out.gep.2
+ store i32 333, i32 addrspace(3)* %out.gep.3
+ store i32 1234, i32 addrspace(3)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}merge_global_store_5_constants_i32:
+; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 9{{$}}
+; GCN-DAG: v_mov_b32_e32 v[[HI4:[0-9]+]], -12{{$}}
+; GCN: buffer_store_dwordx4 v{{\[}}[[LO]]:[[HI4]]{{\]}}
+; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], 11{{$}}
+; GCN: buffer_store_dword v[[HI]]
+define void @merge_global_store_5_constants_i32(i32 addrspace(1)* %out) {
+ store i32 9, i32 addrspace(1)* %out, align 4
+ %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
+ store i32 12, i32 addrspace(1)* %idx1, align 4
+ %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
+ store i32 16, i32 addrspace(1)* %idx2, align 4
+ %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
+ store i32 -12, i32 addrspace(1)* %idx3, align 4
+ %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
+ store i32 11, i32 addrspace(1)* %idx4, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}merge_global_store_6_constants_i32:
+; GCN: buffer_store_dwordx4
+; GCN: buffer_store_dwordx2
+define void @merge_global_store_6_constants_i32(i32 addrspace(1)* %out) {
+ store i32 13, i32 addrspace(1)* %out, align 4
+ %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
+ store i32 15, i32 addrspace(1)* %idx1, align 4
+ %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
+ store i32 62, i32 addrspace(1)* %idx2, align 4
+ %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
+ store i32 63, i32 addrspace(1)* %idx3, align 4
+ %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
+ store i32 11, i32 addrspace(1)* %idx4, align 4
+ %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
+ store i32 123, i32 addrspace(1)* %idx5, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}merge_global_store_7_constants_i32:
+; GCN: buffer_store_dwordx4
+; GCN: buffer_store_dwordx2
+; GCN: buffer_store_dword v
+define void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) {
+ store i32 34, i32 addrspace(1)* %out, align 4
+ %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
+ store i32 999, i32 addrspace(1)* %idx1, align 4
+ %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
+ store i32 65, i32 addrspace(1)* %idx2, align 4
+ %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
+ store i32 33, i32 addrspace(1)* %idx3, align 4
+ %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
+ store i32 98, i32 addrspace(1)* %idx4, align 4
+ %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
+ store i32 91, i32 addrspace(1)* %idx5, align 4
+ %idx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 6
+ store i32 212, i32 addrspace(1)* %idx6, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}merge_global_store_8_constants_i32:
+; XGCN: buffer_store_dwordx4
+; XGCN: buffer_store_dwordx4
+
+; GCN: buffer_store_dword v
+; GCN: buffer_store_dword v
+; GCN: buffer_store_dword v
+; GCN: buffer_store_dword v
+; GCN: buffer_store_dword v
+; GCN: buffer_store_dword v
+; GCN: buffer_store_dword v
+; GCN: buffer_store_dword v
+define void @merge_global_store_8_constants_i32(i32 addrspace(1)* %out) {
+ store i32 34, i32 addrspace(1)* %out, align 4
+ %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
+ store i32 999, i32 addrspace(1)* %idx1, align 4
+ %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
+ store i32 65, i32 addrspace(1)* %idx2, align 4
+ %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
+ store i32 33, i32 addrspace(1)* %idx3, align 4
+ %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
+ store i32 98, i32 addrspace(1)* %idx4, align 4
+ %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
+ store i32 91, i32 addrspace(1)* %idx5, align 4
+ %idx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 6
+ store i32 212, i32 addrspace(1)* %idx6, align 4
+ %idx7 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 7
+ store i32 999, i32 addrspace(1)* %idx7, align 4
+ ret void
+}
+
+declare void @llvm.AMDGPU.barrier.local() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { noduplicate nounwind }
diff --git a/test/CodeGen/AMDGPU/min.ll b/test/CodeGen/AMDGPU/min.ll
new file mode 100644
index 000000000000..0332d1a8e407
--- /dev/null
+++ b/test/CodeGen/AMDGPU/min.ll
@@ -0,0 +1,189 @@
+; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+
+; FUNC-LABEL: @v_test_imin_sle_i32
+; SI: v_min_i32_e32
+define void @v_test_imin_sle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
+ %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
+ %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %a = load i32, i32 addrspace(1)* %gep0, align 4
+ %b = load i32, i32 addrspace(1)* %gep1, align 4
+ %cmp = icmp sle i32 %a, %b
+ %val = select i1 %cmp, i32 %a, i32 %b
+ store i32 %val, i32 addrspace(1)* %outgep, align 4
+ ret void
+}
+
+; FUNC-LABEL: @s_test_imin_sle_i32
+; SI: s_min_i32
+define void @s_test_imin_sle_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+ %cmp = icmp sle i32 %a, %b
+ %val = select i1 %cmp, i32 %a, i32 %b
+ store i32 %val, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @v_test_imin_slt_i32
+; SI: v_min_i32_e32
+define void @v_test_imin_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
+ %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
+ %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %a = load i32, i32 addrspace(1)* %gep0, align 4
+ %b = load i32, i32 addrspace(1)* %gep1, align 4
+ %cmp = icmp slt i32 %a, %b
+ %val = select i1 %cmp, i32 %a, i32 %b
+ store i32 %val, i32 addrspace(1)* %outgep, align 4
+ ret void
+}
+
+; FUNC-LABEL: @s_test_imin_slt_i32
+; SI: s_min_i32
+define void @s_test_imin_slt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+ %cmp = icmp slt i32 %a, %b
+ %val = select i1 %cmp, i32 %a, i32 %b
+ store i32 %val, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_test_imin_slt_imm_i32:
+; SI: s_min_i32 {{s[0-9]+}}, {{s[0-9]+}}, 8
+define void @s_test_imin_slt_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind {
+ %cmp = icmp slt i32 %a, 8
+ %val = select i1 %cmp, i32 %a, i32 8
+ store i32 %val, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_test_imin_sle_imm_i32:
+; SI: s_min_i32 {{s[0-9]+}}, {{s[0-9]+}}, 8
+define void @s_test_imin_sle_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind {
+ %cmp = icmp sle i32 %a, 8
+ %val = select i1 %cmp, i32 %a, i32 8
+ store i32 %val, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @v_test_umin_ule_i32
+; SI: v_min_u32_e32
+define void @v_test_umin_ule_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
+ %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
+ %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %a = load i32, i32 addrspace(1)* %gep0, align 4
+ %b = load i32, i32 addrspace(1)* %gep1, align 4
+ %cmp = icmp ule i32 %a, %b
+ %val = select i1 %cmp, i32 %a, i32 %b
+ store i32 %val, i32 addrspace(1)* %outgep, align 4
+ ret void
+}
+
+; FUNC-LABEL: @s_test_umin_ule_i32
+; SI: s_min_u32
+define void @s_test_umin_ule_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+ %cmp = icmp ule i32 %a, %b
+ %val = select i1 %cmp, i32 %a, i32 %b
+ store i32 %val, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @v_test_umin_ult_i32
+; SI: v_min_u32_e32
+define void @v_test_umin_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
+ %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
+ %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %a = load i32, i32 addrspace(1)* %gep0, align 4
+ %b = load i32, i32 addrspace(1)* %gep1, align 4
+ %cmp = icmp ult i32 %a, %b
+ %val = select i1 %cmp, i32 %a, i32 %b
+ store i32 %val, i32 addrspace(1)* %outgep, align 4
+ ret void
+}
+
+; FUNC-LABEL: @s_test_umin_ult_i32
+; SI: s_min_u32
+define void @s_test_umin_ult_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+ %cmp = icmp ult i32 %a, %b
+ %val = select i1 %cmp, i32 %a, i32 %b
+ store i32 %val, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @v_test_umin_ult_i32_multi_use
+; SI-NOT: v_min
+; SI: v_cmp_lt_u32
+; SI-NEXT: v_cndmask_b32
+; SI-NOT: v_min
+; SI: s_endpgm
+define void @v_test_umin_ult_i32_multi_use(i32 addrspace(1)* %out0, i1 addrspace(1)* %out1, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
+ %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
+ %outgep0 = getelementptr i32, i32 addrspace(1)* %out0, i32 %tid
+ %outgep1 = getelementptr i1, i1 addrspace(1)* %out1, i32 %tid
+ %a = load i32, i32 addrspace(1)* %gep0, align 4
+ %b = load i32, i32 addrspace(1)* %gep1, align 4
+ %cmp = icmp ult i32 %a, %b
+ %val = select i1 %cmp, i32 %a, i32 %b
+ store i32 %val, i32 addrspace(1)* %outgep0, align 4
+ store i1 %cmp, i1 addrspace(1)* %outgep1
+ ret void
+}
+
+; Make sure the redundant 'and' is removed.
+; FUNC-LABEL: {{^}}simplify_demanded_bits_test_umin_ult_i16:
+; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc
+; SI: s_min_u32 [[MIN:s[0-9]+]], [[A]], [[B]]
+; SI-NEXT: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]]
+; SI-NEXT: buffer_store_dword [[VMIN]]
+define void @simplify_demanded_bits_test_umin_ult_i16(i32 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) nounwind {
+ %a.ext = zext i16 %a to i32
+ %b.ext = zext i16 %b to i32
+ %cmp = icmp ult i32 %a.ext, %b.ext
+ %val = select i1 %cmp, i32 %a.ext, i32 %b.ext
+ %mask = and i32 %val, 65535
+ store i32 %mask, i32 addrspace(1)* %out
+ ret void
+}
+
+; Make sure the redundant sign_extend_inreg is removed.
+
+; FUNC-LABEL: {{^}}simplify_demanded_bits_test_min_slt_i16:
+; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc
+; SI: s_min_i32 [[MIN:s[0-9]+]], [[A]], [[B]]
+; SI-NEXT: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]]
+; SI-NEXT: buffer_store_dword [[VMIN]]
+define void @simplify_demanded_bits_test_min_slt_i16(i32 addrspace(1)* %out, i16 signext %a, i16 signext %b) nounwind {
+ %a.ext = sext i16 %a to i32
+ %b.ext = sext i16 %b to i32
+ %cmp = icmp slt i32 %a.ext, %b.ext
+ %val = select i1 %cmp, i32 %a.ext, i32 %b.ext
+ %shl = shl i32 %val, 16
+ %sextinreg = ashr i32 %shl, 16
+ store i32 %sextinreg, i32 addrspace(1)* %out
+ ret void
+}
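+
+; Why the sign_extend_inreg is redundant (a sketch of the reasoning): both
+; operands are sign-extended from i16, so they lie in [-32768, 32767]; the
+; smaller of the two lies in the same range, so bits 16-31 of %val already
+; replicate bit 15 and
+;   ashr (shl %val, 16), 16  ==  %val
+; which is what lets the shift pair be removed.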
+
+; FIXME: Should be able to match min/max through the extends inserted by
+; legalization.
+
+; FUNC-LABEL: {{^}}s_test_imin_sle_i16:
+; SI: s_sext_i32_i16
+; SI: s_sext_i32_i16
+; SI: v_cmp_le_i32_e32
+; SI: v_cndmask_b32
+define void @s_test_imin_sle_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) nounwind {
+ %cmp = icmp sle i16 %a, %b
+ %val = select i1 %cmp, i16 %a, i16 %b
+ store i16 %val, i16 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/min3.ll b/test/CodeGen/AMDGPU/min3.ll
new file mode 100644
index 000000000000..38ef46d1bdd6
--- /dev/null
+++ b/test/CodeGen/AMDGPU/min3.ll
@@ -0,0 +1,111 @@
+; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+
+; FUNC-LABEL: @v_test_imin3_slt_i32
+; SI: v_min3_i32
+define void @v_test_imin3_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
+ %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
+ %gep2 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid
+ %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %a = load i32, i32 addrspace(1)* %gep0, align 4
+ %b = load i32, i32 addrspace(1)* %gep1, align 4
+ %c = load i32, i32 addrspace(1)* %gep2, align 4
+ %icmp0 = icmp slt i32 %a, %b
+ %i0 = select i1 %icmp0, i32 %a, i32 %b
+ %icmp1 = icmp slt i32 %i0, %c
+ %i1 = select i1 %icmp1, i32 %i0, i32 %c
+ store i32 %i1, i32 addrspace(1)* %outgep, align 4
+ ret void
+}
+
+; FUNC-LABEL: @v_test_umin3_ult_i32
+; SI: v_min3_u32
+define void @v_test_umin3_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
+ %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
+ %gep2 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid
+ %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %a = load i32, i32 addrspace(1)* %gep0, align 4
+ %b = load i32, i32 addrspace(1)* %gep1, align 4
+ %c = load i32, i32 addrspace(1)* %gep2, align 4
+ %icmp0 = icmp ult i32 %a, %b
+ %i0 = select i1 %icmp0, i32 %a, i32 %b
+ %icmp1 = icmp ult i32 %i0, %c
+ %i1 = select i1 %icmp1, i32 %i0, i32 %c
+ store i32 %i1, i32 addrspace(1)* %outgep, align 4
+ ret void
+}
+
+; FUNC-LABEL: @v_test_umin_umin_umin
+; SI: v_min_i32
+; SI: v_min3_i32
+define void @v_test_umin_umin_umin(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid2 = mul i32 %tid, 2
+ %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
+ %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
+ %gep2 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid
+
+ %gep3 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid2
+ %gep4 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid2
+ %gep5 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid2
+
+ %outgep0 = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %outgep1 = getelementptr i32, i32 addrspace(1)* %out, i32 %tid2
+
+ %a = load i32, i32 addrspace(1)* %gep0, align 4
+ %b = load i32, i32 addrspace(1)* %gep1, align 4
+ %c = load i32, i32 addrspace(1)* %gep2, align 4
+ %d = load i32, i32 addrspace(1)* %gep3, align 4
+
+ %icmp0 = icmp slt i32 %a, %b
+ %i0 = select i1 %icmp0, i32 %a, i32 %b
+
+ %icmp1 = icmp slt i32 %c, %d
+ %i1 = select i1 %icmp1, i32 %c, i32 %d
+
+ %icmp2 = icmp slt i32 %i0, %i1
+ %i2 = select i1 %icmp2, i32 %i0, i32 %i1
+
+ store i32 %i2, i32 addrspace(1)* %outgep1, align 4
+ ret void
+}
+
+; FUNC-LABEL: @v_test_umin3_2_uses
+; SI-NOT: v_min3
+define void @v_test_umin3_2_uses(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %tid2 = mul i32 %tid, 2
+ %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
+ %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
+ %gep2 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid
+
+ %gep3 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid2
+ %gep4 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid2
+ %gep5 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid2
+
+ %outgep0 = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %outgep1 = getelementptr i32, i32 addrspace(1)* %out, i32 %tid2
+
+ %a = load i32, i32 addrspace(1)* %gep0, align 4
+ %b = load i32, i32 addrspace(1)* %gep1, align 4
+ %c = load i32, i32 addrspace(1)* %gep2, align 4
+ %d = load i32, i32 addrspace(1)* %gep3, align 4
+
+ %icmp0 = icmp slt i32 %a, %b
+ %i0 = select i1 %icmp0, i32 %a, i32 %b
+
+ %icmp1 = icmp slt i32 %c, %d
+ %i1 = select i1 %icmp1, i32 %c, i32 %d
+
+ %icmp2 = icmp slt i32 %i0, %c
+ %i2 = select i1 %icmp2, i32 %i0, i32 %c
+
+ store i32 %i2, i32 addrspace(1)* %outgep0, align 4
+ store i32 %i0, i32 addrspace(1)* %outgep1, align 4
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/missing-store.ll b/test/CodeGen/AMDGPU/missing-store.ll
new file mode 100644
index 000000000000..4af9cdf1b960
--- /dev/null
+++ b/test/CodeGen/AMDGPU/missing-store.ll
@@ -0,0 +1,26 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s
+
+@ptr_load = addrspace(3) global i32 addrspace(2)* undef, align 8
+
+; Make sure that when the load from %ptr2 is folded, the chain isn't lost,
+; which would result in losing the store to %gptr.
+
+; FUNC-LABEL: {{^}}missing_store_reduced:
+; SI: ds_read_b64
+; SI: buffer_store_dword
+; SI: buffer_load_dword
+; SI: buffer_store_dword
+; SI: s_endpgm
+define void @missing_store_reduced(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
+ %ptr0 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(3)* @ptr_load, align 8
+ %ptr2 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 2
+
+ store i32 99, i32 addrspace(1)* %gptr, align 4
+ %tmp2 = load i32, i32 addrspace(2)* %ptr2, align 4
+
+ store i32 %tmp2, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+attributes #0 = { nounwind }
+
diff --git a/test/CodeGen/AMDGPU/mubuf.ll b/test/CodeGen/AMDGPU/mubuf.ll
new file mode 100644
index 000000000000..b19163f294e0
--- /dev/null
+++ b/test/CodeGen/AMDGPU/mubuf.ll
@@ -0,0 +1,183 @@
+; RUN: llc -march=amdgcn -mcpu=SI -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s
+
+declare i32 @llvm.r600.read.tidig.x() readnone
+
+;;;==========================================================================;;;
+;;; MUBUF LOAD TESTS
+;;;==========================================================================;;;
+
+; MUBUF load with an immediate byte offset that fits into 12 bits
+; CHECK-LABEL: {{^}}mubuf_load0:
+; CHECK: buffer_load_dword v{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0 offset:4 ; encoding: [0x04,0x00,0x30,0xe0
+define void @mubuf_load0(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+entry:
+ %0 = getelementptr i32, i32 addrspace(1)* %in, i64 1
+ %1 = load i32, i32 addrspace(1)* %0
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; MUBUF load with the largest possible immediate offset
+; CHECK-LABEL: {{^}}mubuf_load1:
+; CHECK: buffer_load_ubyte v{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe0
+define void @mubuf_load1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
+entry:
+ %0 = getelementptr i8, i8 addrspace(1)* %in, i64 4095
+ %1 = load i8, i8 addrspace(1)* %0
+ store i8 %1, i8 addrspace(1)* %out
+ ret void
+}
+
+; MUBUF load with an immediate byte offset that doesn't fit into 12 bits
+; CHECK-LABEL: {{^}}mubuf_load2:
+; CHECK: s_movk_i32 [[SOFFSET:s[0-9]+]], 0x1000
+; CHECK: buffer_load_dword v{{[0-9]}}, s[{{[0-9]+:[0-9]+}}], [[SOFFSET]] ; encoding: [0x00,0x00,0x30,0xe0
+define void @mubuf_load2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+entry:
+ %0 = getelementptr i32, i32 addrspace(1)* %in, i64 1024
+ %1 = load i32, i32 addrspace(1)* %0
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
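+
+; A note on the offset arithmetic above (illustrative): element index 1024 of an
+; i32 array is a byte offset of 1024 * 4 = 4096, one past the 4095-byte maximum
+; of the 12-bit immediate field, so the offset has to be materialized in an SGPR
+; (s_movk_i32) and passed through soffset instead.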
+
+; MUBUF load with a 12-bit immediate offset and a register offset
+; CHECK-LABEL: {{^}}mubuf_load3:
+; CHECK-NOT: ADD
+; CHECK: buffer_load_dword v{{[0-9]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:4 ; encoding: [0x04,0x80,0x30,0xe0
+define void @mubuf_load3(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i64 %offset) {
+entry:
+ %0 = getelementptr i32, i32 addrspace(1)* %in, i64 %offset
+ %1 = getelementptr i32, i32 addrspace(1)* %0, i64 1
+ %2 = load i32, i32 addrspace(1)* %1
+ store i32 %2, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}soffset_max_imm:
+; CHECK: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 64 offen glc
+define void @soffset_max_imm([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [16 x <4 x i32>] addrspace(2)* byval, [32 x <8 x i32>] addrspace(2)* byval, i32 inreg, i32 inreg, i32, i32, i32, i32, i32, i32, i32, i32) #1 {
+main_body:
+ %tmp0 = getelementptr [6 x <16 x i8>], [6 x <16 x i8>] addrspace(2)* %0, i32 0, i32 0
+ %tmp1 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp0
+ %tmp2 = shl i32 %6, 2
+ %tmp3 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %tmp1, i32 %tmp2, i32 64, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0)
+ %tmp4 = add i32 %6, 16
+ %tmp5 = bitcast float 0.0 to i32
+ call void @llvm.SI.tbuffer.store.i32(<16 x i8> %tmp1, i32 %tmp5, i32 1, i32 %tmp4, i32 %4, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0)
+ ret void
+}
+
+; Make sure immediates that aren't inline constants don't get folded into
+; the soffset operand.
+; FIXME: for this test we should be smart enough to shift the immediate into
+; the offset field.
+; CHECK-LABEL: {{^}}soffset_no_fold:
+; CHECK: s_movk_i32 [[SOFFSET:s[0-9]+]], 0x41
+; CHECK: buffer_load_dword v{{[0-9+]}}, v{{[0-9+]}}, s[{{[0-9]+}}:{{[0-9]+}}], [[SOFFSET]] offen glc
+define void @soffset_no_fold([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [16 x <4 x i32>] addrspace(2)* byval, [32 x <8 x i32>] addrspace(2)* byval, i32 inreg, i32 inreg, i32, i32, i32, i32, i32, i32, i32, i32) #1 {
+main_body:
+ %tmp0 = getelementptr [6 x <16 x i8>], [6 x <16 x i8>] addrspace(2)* %0, i32 0, i32 0
+ %tmp1 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp0
+ %tmp2 = shl i32 %6, 2
+ %tmp3 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %tmp1, i32 %tmp2, i32 65, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0)
+ %tmp4 = add i32 %6, 16
+ %tmp5 = bitcast float 0.0 to i32
+ call void @llvm.SI.tbuffer.store.i32(<16 x i8> %tmp1, i32 %tmp5, i32 1, i32 %tmp4, i32 %4, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0)
+ ret void
+}
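+
+; Why 64 folds but 65 does not (a sketch of the reasoning, assuming the usual SI
+; inline-constant range): the encoding provides inline constants for the
+; integers -16..64, so the soffset of 64 in soffset_max_imm can be encoded
+; directly, while 65 must first be materialized in an SGPR with s_movk_i32.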
+
+;;;==========================================================================;;;
+;;; MUBUF STORE TESTS
+;;;==========================================================================;;;
+
+; MUBUF store with an immediate byte offset that fits into 12 bits
+; CHECK-LABEL: {{^}}mubuf_store0:
+; CHECK: buffer_store_dword v{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0 offset:4 ; encoding: [0x04,0x00,0x70,0xe0
+define void @mubuf_store0(i32 addrspace(1)* %out) {
+entry:
+ %0 = getelementptr i32, i32 addrspace(1)* %out, i64 1
+ store i32 0, i32 addrspace(1)* %0
+ ret void
+}
+
+; MUBUF store with the largest possible immediate offset
+; CHECK-LABEL: {{^}}mubuf_store1:
+; CHECK: buffer_store_byte v{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0 offset:4095 ; encoding: [0xff,0x0f,0x60,0xe0
+
+define void @mubuf_store1(i8 addrspace(1)* %out) {
+entry:
+ %0 = getelementptr i8, i8 addrspace(1)* %out, i64 4095
+ store i8 0, i8 addrspace(1)* %0
+ ret void
+}
+
+; MUBUF store with an immediate byte offset that doesn't fit into 12 bits
+; CHECK-LABEL: {{^}}mubuf_store2:
+; CHECK: s_movk_i32 [[SOFFSET:s[0-9]+]], 0x1000
+; CHECK: buffer_store_dword v{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[SOFFSET]] ; encoding: [0x00,0x00,0x70,0xe0
+define void @mubuf_store2(i32 addrspace(1)* %out) {
+entry:
+ %0 = getelementptr i32, i32 addrspace(1)* %out, i64 1024
+ store i32 0, i32 addrspace(1)* %0
+ ret void
+}
+
+; MUBUF store with a 12-bit immediate offset and a register offset
+; CHECK-LABEL: {{^}}mubuf_store3:
+; CHECK-NOT: ADD
+; CHECK: buffer_store_dword v{{[0-9]}}, v[{{[0-9]:[0-9]}}], s[{{[0-9]:[0-9]}}], 0 addr64 offset:4 ; encoding: [0x04,0x80,0x70,0xe0
+define void @mubuf_store3(i32 addrspace(1)* %out, i64 %offset) {
+entry:
+ %0 = getelementptr i32, i32 addrspace(1)* %out, i64 %offset
+ %1 = getelementptr i32, i32 addrspace(1)* %0, i64 1
+ store i32 0, i32 addrspace(1)* %1
+ ret void
+}
+
+; CHECK-LABEL: {{^}}store_sgpr_ptr:
+; CHECK: buffer_store_dword v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0
+define void @store_sgpr_ptr(i32 addrspace(1)* %out) #0 {
+ store i32 99, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; CHECK-LABEL: {{^}}store_sgpr_ptr_offset:
+; CHECK: buffer_store_dword v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:40
+define void @store_sgpr_ptr_offset(i32 addrspace(1)* %out) #0 {
+ %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 10
+ store i32 99, i32 addrspace(1)* %out.gep, align 4
+ ret void
+}
+
+; CHECK-LABEL: {{^}}store_sgpr_ptr_large_offset:
+; CHECK: s_mov_b32 [[SOFFSET:s[0-9]+]], 0x20000
+; CHECK: buffer_store_dword v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, [[SOFFSET]]
+define void @store_sgpr_ptr_large_offset(i32 addrspace(1)* %out) #0 {
+ %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 32768
+ store i32 99, i32 addrspace(1)* %out.gep, align 4
+ ret void
+}
+
+; CHECK-LABEL: {{^}}store_sgpr_ptr_large_offset_atomic:
+; CHECK: s_mov_b32 [[SOFFSET:s[0-9]+]], 0x20000
+; CHECK: buffer_atomic_add v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, [[SOFFSET]]
+define void @store_sgpr_ptr_large_offset_atomic(i32 addrspace(1)* %out) #0 {
+ %gep = getelementptr i32, i32 addrspace(1)* %out, i32 32768
+ %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 5 seq_cst
+ ret void
+}
+
+; CHECK-LABEL: {{^}}store_vgpr_ptr:
+; CHECK: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64
+define void @store_vgpr_ptr(i32 addrspace(1)* %out) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x() readnone
+ %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ store i32 99, i32 addrspace(1)* %out.gep, align 4
+ ret void
+}
+
+declare i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #3
+declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)
+
+attributes #1 = { "ShaderType"="2" "unsafe-fp-math"="true" }
+attributes #3 = { nounwind readonly }
diff --git a/test/CodeGen/AMDGPU/mul.ll b/test/CodeGen/AMDGPU/mul.ll
new file mode 100644
index 000000000000..94e0f96b323e
--- /dev/null
+++ b/test/CodeGen/AMDGPU/mul.ll
@@ -0,0 +1,200 @@
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s -check-prefix=FUNC
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+; mul24 and mad24 lowering can affect these tests.
+
+; FUNC-LABEL: {{^}}test_mul_v2i32:
+; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+
+define void @test_mul_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+ %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
+ %a = load <2 x i32>, <2 x i32> addrspace(1) * %in
+ %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr
+ %result = mul <2 x i32> %a, %b
+ store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_mul_v4i32:
+; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+
+define void @v_mul_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+ %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
+ %a = load <4 x i32>, <4 x i32> addrspace(1) * %in
+ %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr
+ %result = mul <4 x i32> %a, %b
+ store <4 x i32> %result, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_trunc_i64_mul_to_i32:
+; SI: s_load_dword
+; SI: s_load_dword
+; SI: s_mul_i32
+; SI: buffer_store_dword
+define void @s_trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) {
+ %mul = mul i64 %b, %a
+ %trunc = trunc i64 %mul to i32
+ store i32 %trunc, i32 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_trunc_i64_mul_to_i32:
+; SI: s_load_dword
+; SI: s_load_dword
+; SI: v_mul_lo_i32
+; SI: buffer_store_dword
+define void @v_trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
+ %a = load i64, i64 addrspace(1)* %aptr, align 8
+ %b = load i64, i64 addrspace(1)* %bptr, align 8
+ %mul = mul i64 %b, %a
+ %trunc = trunc i64 %mul to i32
+ store i32 %trunc, i32 addrspace(1)* %out, align 8
+ ret void
+}
+
+; This 64-bit multiply should just use MUL_HI and MUL_LO, since the top
+; 32 bits of both arguments are sign bits.
+; FUNC-LABEL: {{^}}mul64_sext_c:
+; EG-DAG: MULLO_INT
+; EG-DAG: MULHI_INT
+; SI-DAG: s_mul_i32
+; SI-DAG: v_mul_hi_i32
+define void @mul64_sext_c(i64 addrspace(1)* %out, i32 %in) {
+entry:
+ %0 = sext i32 %in to i64
+ %1 = mul i64 %0, 80
+ store i64 %1, i64 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_mul64_sext_c:
+; EG-DAG: MULLO_INT
+; EG-DAG: MULHI_INT
+; SI-DAG: v_mul_lo_i32
+; SI-DAG: v_mul_hi_i32
+; SI: s_endpgm
+define void @v_mul64_sext_c(i64 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %val = load i32, i32 addrspace(1)* %in, align 4
+ %ext = sext i32 %val to i64
+ %mul = mul i64 %ext, 80
+ store i64 %mul, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_mul64_sext_inline_imm:
+; SI-DAG: v_mul_lo_i32 v{{[0-9]+}}, 9, v{{[0-9]+}}
+; SI-DAG: v_mul_hi_i32 v{{[0-9]+}}, 9, v{{[0-9]+}}
+; SI: s_endpgm
+define void @v_mul64_sext_inline_imm(i64 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %val = load i32, i32 addrspace(1)* %in, align 4
+ %ext = sext i32 %val to i64
+ %mul = mul i64 %ext, 9
+ store i64 %mul, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_mul_i32:
+; SI: s_load_dword [[SRC0:s[0-9]+]],
+; SI: s_load_dword [[SRC1:s[0-9]+]],
+; SI: s_mul_i32 [[SRESULT:s[0-9]+]], [[SRC0]], [[SRC1]]
+; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
+; SI: buffer_store_dword [[VRESULT]],
+; SI: s_endpgm
+define void @s_mul_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+ %mul = mul i32 %a, %b
+ store i32 %mul, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_mul_i32:
+; SI: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @v_mul_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+ %a = load i32, i32 addrspace(1)* %in
+ %b = load i32, i32 addrspace(1)* %b_ptr
+ %result = mul i32 %a, %b
+ store i32 %result, i32 addrspace(1)* %out
+ ret void
+}
+
+; A standard 64-bit multiply. The expansion should be around 6 instructions.
+; It would be difficult to match the expansion correctly without writing
+; a really complicated list of FileCheck expressions. I don't want
+; to confuse people who may 'break' this test with a correct optimization,
+; so this test just uses FUNC-LABEL to make sure the compiler does not
+; crash with a 'failed to select' error.
+
+; FUNC-LABEL: {{^}}s_mul_i64:
+define void @s_mul_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+ %mul = mul i64 %a, %b
+ store i64 %mul, i64 addrspace(1)* %out, align 8
+ ret void
+}
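+
+; A rough sketch of the expected expansion (illustrative only; the exact
+; selection may differ). Writing a = a_hi * 2^32 + a_lo and b = b_hi * 2^32 + b_lo,
+; the low 64 bits of the product are:
+;   lo32 = mul_lo(a_lo, b_lo)
+;   hi32 = mul_hi(a_lo, b_lo) + mul_lo(a_lo, b_hi) + mul_lo(a_hi, b_lo)
+; i.e. three low 32-bit multiplies, one high 32-bit multiply, and two adds,
+; which is where the "around 6 instructions" estimate above comes from.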
+
+; FUNC-LABEL: {{^}}v_mul_i64:
+; SI: v_mul_lo_i32
+define void @v_mul_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) {
+ %a = load i64, i64 addrspace(1)* %aptr, align 8
+ %b = load i64, i64 addrspace(1)* %bptr, align 8
+ %mul = mul i64 %a, %b
+ store i64 %mul, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}mul32_in_branch:
+; SI: s_mul_i32
+define void @mul32_in_branch(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %a, i32 %b, i32 %c) {
+entry:
+ %0 = icmp eq i32 %a, 0
+ br i1 %0, label %if, label %else
+
+if:
+ %1 = load i32, i32 addrspace(1)* %in
+ br label %endif
+
+else:
+ %2 = mul i32 %a, %b
+ br label %endif
+
+endif:
+ %3 = phi i32 [%1, %if], [%2, %else]
+ store i32 %3, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}mul64_in_branch:
+; SI-DAG: s_mul_i32
+; SI-DAG: v_mul_hi_u32
+; SI: s_endpgm
+define void @mul64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) {
+entry:
+ %0 = icmp eq i64 %a, 0
+ br i1 %0, label %if, label %else
+
+if:
+ %1 = load i64, i64 addrspace(1)* %in
+ br label %endif
+
+else:
+ %2 = mul i64 %a, %b
+ br label %endif
+
+endif:
+ %3 = phi i64 [%1, %if], [%2, %else]
+ store i64 %3, i64 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/mul_int24.ll b/test/CodeGen/AMDGPU/mul_int24.ll
new file mode 100644
index 000000000000..7609dcc87afa
--- /dev/null
+++ b/test/CodeGen/AMDGPU/mul_int24.ll
@@ -0,0 +1,23 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC
+; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM --check-prefix=FUNC
+; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
+
+; FUNC-LABEL: {{^}}i32_mul24:
+; Signed 24-bit multiply is not supported on pre-Cayman GPUs.
+; EG: MULLO_INT
+; Make sure we are not masking the inputs
+; CM-NOT: AND
+; CM: MUL_INT24
+; SI-NOT: and
+; SI: v_mul_i32_i24
+define void @i32_mul24(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+entry:
+ %0 = shl i32 %a, 8
+ %a_24 = ashr i32 %0, 8
+ %1 = shl i32 %b, 8
+ %b_24 = ashr i32 %1, 8
+ %2 = mul i32 %a_24, %b_24
+ store i32 %2, i32 addrspace(1)* %out
+ ret void
+}
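+
+; How the shl/ashr pair above works (illustrative): shifting left by 8 and then
+; arithmetic-shifting right by 8 replicates bit 23 into bits 24-31, i.e. it
+; sign-extends from 24 bits. For example, with %a = 0x00ffffff (-1 as a 24-bit
+; value):
+;   shl  i32 0x00ffffff, 8   ->  0xffffff00
+;   ashr i32 0xffffff00, 8   ->  0xffffffff  (= -1)
+; Both inputs are therefore known to fit in 24 signed bits, which is what allows
+; the v_mul_i32_i24 selection checked above.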
diff --git a/test/CodeGen/AMDGPU/mul_uint24.ll b/test/CodeGen/AMDGPU/mul_uint24.ll
new file mode 100644
index 000000000000..e640a7cd69f6
--- /dev/null
+++ b/test/CodeGen/AMDGPU/mul_uint24.ll
@@ -0,0 +1,67 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC
+; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG --check-prefix=FUNC
+; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
+
+; FUNC-LABEL: {{^}}u32_mul24:
+; EG: MUL_UINT24 {{[* ]*}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, KC0[2].W
+; SI: v_mul_u32_u24
+
+define void @u32_mul24(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+entry:
+ %0 = shl i32 %a, 8
+ %a_24 = lshr i32 %0, 8
+ %1 = shl i32 %b, 8
+ %b_24 = lshr i32 %1, 8
+ %2 = mul i32 %a_24, %b_24
+ store i32 %2, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}i16_mul24:
+; EG: MUL_UINT24 {{[* ]*}}T{{[0-9]}}.[[MUL_CHAN:[XYZW]]]
+; The result must be sign-extended
+; EG: BFE_INT {{[* ]*}}T{{[0-9]}}.{{[XYZW]}}, PV.[[MUL_CHAN]], 0.0, literal.x
+; EG: 16
+; SI: v_mul_u32_u24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
+; SI: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 16
+define void @i16_mul24(i32 addrspace(1)* %out, i16 %a, i16 %b) {
+entry:
+ %0 = mul i16 %a, %b
+ %1 = sext i16 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}i8_mul24:
+; EG: MUL_UINT24 {{[* ]*}}T{{[0-9]}}.[[MUL_CHAN:[XYZW]]]
+; The result must be sign-extended
+; EG: BFE_INT {{[* ]*}}T{{[0-9]}}.{{[XYZW]}}, PV.[[MUL_CHAN]], 0.0, literal.x
+; SI: v_mul_u32_u24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
+; SI: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 8
+
+define void @i8_mul24(i32 addrspace(1)* %out, i8 %a, i8 %b) {
+entry:
+ %0 = mul i8 %a, %b
+ %1 = sext i8 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; Multiply with 24-bit inputs and 64-bit output
+; FUNC-LABEL: {{^}}mul24_i64:
+; EG: MUL_UINT24
+; EG: MULHI
+; SI: v_mul_u32_u24
+; FIXME: SI supports a 24-bit mulhi; it should be used here.
+; SI: v_mul_hi_u32
+define void @mul24_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+entry:
+ %0 = shl i64 %a, 40
+ %a_24 = lshr i64 %0, 40
+ %1 = shl i64 %b, 40
+ %b_24 = lshr i64 %1, 40
+ %2 = mul i64 %a_24, %b_24
+ store i64 %2, i64 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/mulhu.ll b/test/CodeGen/AMDGPU/mulhu.ll
new file mode 100644
index 000000000000..29b0944a5533
--- /dev/null
+++ b/test/CodeGen/AMDGPU/mulhu.ll
@@ -0,0 +1,17 @@
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
+
+;CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0xaaaaaaab
+;CHECK: v_mul_hi_u32 v0, {{v[0-9]+}}, {{s[0-9]+}}
+;CHECK-NEXT: v_lshrrev_b32_e32 v0, 1, v0
+
+define void @test(i32 %p) {
+ %i = udiv i32 %p, 3
+ %r = bitcast i32 %i to float
+ call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %r, float %r, float %r, float %r)
+ ret void
+}
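+
+; Sketch of why the 0xaaaaaaab constant appears (illustrative): an unsigned
+; divide by 3 is lowered to a multiply by a "magic number". 0xaaaaaaab is
+; ceil(2^33 / 3), and for every 32-bit unsigned p
+;   p / 3 == (p * 0xaaaaaaab) >> 33 == mulhi_u32(p, 0xaaaaaaab) >> 1
+; which matches the v_mul_hi_u32 followed by the 1-bit v_lshrrev_b32 checked
+; above.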
+
+declare <4 x float> @llvm.SI.sample.(i32, <4 x i32>, <8 x i32>, <4 x i32>, i32) readnone
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
diff --git a/test/CodeGen/AMDGPU/no-initializer-constant-addrspace.ll b/test/CodeGen/AMDGPU/no-initializer-constant-addrspace.ll
new file mode 100644
index 000000000000..9a814b579deb
--- /dev/null
+++ b/test/CodeGen/AMDGPU/no-initializer-constant-addrspace.ll
@@ -0,0 +1,21 @@
+; RUN: llc -march=amdgcn -mcpu=SI -o /dev/null %s
+; RUN: llc -march=amdgcn -mcpu=tonga -o /dev/null %s
+; RUN: llc -march=r600 -mcpu=cypress -o /dev/null %s
+
+@extern_const_addrspace = external unnamed_addr addrspace(2) constant [5 x i32], align 4
+
+; FUNC-LABEL: {{^}}load_extern_const_init:
+define void @load_extern_const_init(i32 addrspace(1)* %out) nounwind {
+ %val = load i32, i32 addrspace(2)* getelementptr ([5 x i32], [5 x i32] addrspace(2)* @extern_const_addrspace, i64 0, i64 3), align 4
+ store i32 %val, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+@undef_const_addrspace = unnamed_addr addrspace(2) constant [5 x i32] undef, align 4
+
+; FUNC-LABEL: {{^}}load_undef_const_init:
+define void @load_undef_const_init(i32 addrspace(1)* %out) nounwind {
+ %val = load i32, i32 addrspace(2)* getelementptr ([5 x i32], [5 x i32] addrspace(2)* @undef_const_addrspace, i64 0, i64 3), align 4
+ store i32 %val, i32 addrspace(1)* %out, align 4
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/no-shrink-extloads.ll b/test/CodeGen/AMDGPU/no-shrink-extloads.ll
new file mode 100644
index 000000000000..e4328ecbaca8
--- /dev/null
+++ b/test/CodeGen/AMDGPU/no-shrink-extloads.ll
@@ -0,0 +1,191 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+
+; Make sure we don't turn the 32-bit argument load into a 16-bit
+; load. There aren't extending scalar loads, so that would require
+; using a buffer_load instruction.
+
+; FUNC-LABEL: {{^}}truncate_kernarg_i32_to_i16:
+; SI: s_load_dword s
+; SI: buffer_store_short v
+define void @truncate_kernarg_i32_to_i16(i16 addrspace(1)* %out, i32 %arg) nounwind {
+ %trunc = trunc i32 %arg to i16
+ store i16 %trunc, i16 addrspace(1)* %out
+ ret void
+}
+
+; It should be OK (and probably performance neutral) to reduce this,
+; but we don't know if the load is uniform yet.
+
+; FUNC-LABEL: {{^}}truncate_buffer_load_i32_to_i16:
+; SI: buffer_load_dword v
+; SI: buffer_store_short v
+define void @truncate_buffer_load_i32_to_i16(i16 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i16, i16 addrspace(1)* %out, i32 %tid
+ %load = load i32, i32 addrspace(1)* %gep.in
+ %trunc = trunc i32 %load to i16
+ store i16 %trunc, i16 addrspace(1)* %gep.out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}truncate_kernarg_i32_to_i8:
+; SI: s_load_dword s
+; SI: buffer_store_byte v
+define void @truncate_kernarg_i32_to_i8(i8 addrspace(1)* %out, i32 %arg) nounwind {
+ %trunc = trunc i32 %arg to i8
+ store i8 %trunc, i8 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}truncate_buffer_load_i32_to_i8:
+; SI: buffer_load_dword v
+; SI: buffer_store_byte v
+define void @truncate_buffer_load_i32_to_i8(i8 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i8, i8 addrspace(1)* %out, i32 %tid
+ %load = load i32, i32 addrspace(1)* %gep.in
+ %trunc = trunc i32 %load to i8
+ store i8 %trunc, i8 addrspace(1)* %gep.out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}truncate_kernarg_i32_to_i1:
+; SI: s_load_dword s
+; SI: buffer_store_byte v
+define void @truncate_kernarg_i32_to_i1(i1 addrspace(1)* %out, i32 %arg) nounwind {
+ %trunc = trunc i32 %arg to i1
+ store i1 %trunc, i1 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}truncate_buffer_load_i32_to_i1:
+; SI: buffer_load_dword v
+; SI: buffer_store_byte v
+define void @truncate_buffer_load_i32_to_i1(i1 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i1, i1 addrspace(1)* %out, i32 %tid
+ %load = load i32, i32 addrspace(1)* %gep.in
+ %trunc = trunc i32 %load to i1
+ store i1 %trunc, i1 addrspace(1)* %gep.out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}truncate_kernarg_i64_to_i32:
+; SI: s_load_dword s
+; SI: buffer_store_dword v
+define void @truncate_kernarg_i64_to_i32(i32 addrspace(1)* %out, i64 %arg) nounwind {
+ %trunc = trunc i64 %arg to i32
+ store i32 %trunc, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}truncate_buffer_load_i64_to_i32:
+; SI: buffer_load_dword v
+; SI: buffer_store_dword v
+define void @truncate_buffer_load_i64_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %load = load i64, i64 addrspace(1)* %gep.in
+ %trunc = trunc i64 %load to i32
+ store i32 %trunc, i32 addrspace(1)* %gep.out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}srl_kernarg_i64_to_i32:
+; SI: s_load_dword s
+; SI: buffer_store_dword v
+define void @srl_kernarg_i64_to_i32(i32 addrspace(1)* %out, i64 %arg) nounwind {
+ %srl = lshr i64 %arg, 32
+ %trunc = trunc i64 %srl to i32
+ store i32 %trunc, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}srl_buffer_load_i64_to_i32:
+; SI: buffer_load_dword v
+; SI: buffer_store_dword v
+define void @srl_buffer_load_i64_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %load = load i64, i64 addrspace(1)* %gep.in
+ %srl = lshr i64 %load, 32
+ %trunc = trunc i64 %srl to i32
+ store i32 %trunc, i32 addrspace(1)* %gep.out
+ ret void
+}
+
+; Might as well reduce to 8-bit loads.
+; FUNC-LABEL: {{^}}truncate_kernarg_i16_to_i8:
+; SI: s_load_dword s
+; SI: buffer_store_byte v
+define void @truncate_kernarg_i16_to_i8(i8 addrspace(1)* %out, i16 %arg) nounwind {
+ %trunc = trunc i16 %arg to i8
+ store i8 %trunc, i8 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}truncate_buffer_load_i16_to_i8:
+; SI: buffer_load_ubyte v
+; SI: buffer_store_byte v
+define void @truncate_buffer_load_i16_to_i8(i8 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %gep.in = getelementptr i16, i16 addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i8, i8 addrspace(1)* %out, i32 %tid
+ %load = load i16, i16 addrspace(1)* %gep.in
+ %trunc = trunc i16 %load to i8
+ store i8 %trunc, i8 addrspace(1)* %gep.out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}srl_kernarg_i64_to_i8:
+; SI: s_load_dword s
+; SI: buffer_store_byte v
+define void @srl_kernarg_i64_to_i8(i8 addrspace(1)* %out, i64 %arg) nounwind {
+ %srl = lshr i64 %arg, 32
+ %trunc = trunc i64 %srl to i8
+ store i8 %trunc, i8 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}srl_buffer_load_i64_to_i8:
+; SI: buffer_load_dword v
+; SI: buffer_store_byte v
+define void @srl_buffer_load_i64_to_i8(i8 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i8, i8 addrspace(1)* %out, i32 %tid
+ %load = load i64, i64 addrspace(1)* %gep.in
+ %srl = lshr i64 %load, 32
+ %trunc = trunc i64 %srl to i8
+ store i8 %trunc, i8 addrspace(1)* %gep.out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}truncate_kernarg_i64_to_i8:
+; SI: s_load_dword s
+; SI: buffer_store_byte v
+define void @truncate_kernarg_i64_to_i8(i8 addrspace(1)* %out, i64 %arg) nounwind {
+ %trunc = trunc i64 %arg to i8
+ store i8 %trunc, i8 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}truncate_buffer_load_i64_to_i8:
+; SI: buffer_load_dword v
+; SI: buffer_store_byte v
+define void @truncate_buffer_load_i64_to_i8(i8 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr i8, i8 addrspace(1)* %out, i32 %tid
+ %load = load i64, i64 addrspace(1)* %gep.in
+ %trunc = trunc i64 %load to i8
+ store i8 %trunc, i8 addrspace(1)* %gep.out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/operand-folding.ll b/test/CodeGen/AMDGPU/operand-folding.ll
new file mode 100644
index 000000000000..816755efb07c
--- /dev/null
+++ b/test/CodeGen/AMDGPU/operand-folding.ll
@@ -0,0 +1,113 @@
+; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s
+
+; CHECK-LABEL: {{^}}fold_sgpr:
+; CHECK: v_add_i32_e32 v{{[0-9]+}}, s
+define void @fold_sgpr(i32 addrspace(1)* %out, i32 %fold) {
+entry:
+ %tmp0 = icmp ne i32 %fold, 0
+ br i1 %tmp0, label %if, label %endif
+
+if:
+ %id = call i32 @llvm.r600.read.tidig.x()
+ %offset = add i32 %fold, %id
+ %tmp1 = getelementptr i32, i32 addrspace(1)* %out, i32 %offset
+ store i32 0, i32 addrspace(1)* %tmp1
+ br label %endif
+
+endif:
+ ret void
+}
+
+; CHECK-LABEL: {{^}}fold_imm:
+; CHECK: v_or_b32_e32 v{{[0-9]+}}, 5
+define void @fold_imm(i32 addrspace(1)* %out, i32 %cmp) {
+entry:
+ %fold = add i32 3, 2
+ %tmp0 = icmp ne i32 %cmp, 0
+ br i1 %tmp0, label %if, label %endif
+
+if:
+ %id = call i32 @llvm.r600.read.tidig.x()
+ %val = or i32 %id, %fold
+ store i32 %val, i32 addrspace(1)* %out
+ br label %endif
+
+endif:
+ ret void
+}
+
+; CHECK-LABEL: {{^}}fold_64bit_constant_add:
+; CHECK-NOT: s_mov_b64
+; FIXME: It would be better if we could use v_add here and drop the extra
+; v_mov_b32 instructions.
+; CHECK-DAG: s_add_u32 [[LO:s[0-9]+]], s{{[0-9]+}}, 1
+; CHECK-DAG: s_addc_u32 [[HI:s[0-9]+]], s{{[0-9]+}}, 0
+; CHECK-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], [[LO]]
+; CHECK-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], [[HI]]
+; CHECK: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}},
+
+define void @fold_64bit_constant_add(i64 addrspace(1)* %out, i32 %cmp, i64 %val) {
+entry:
+ %tmp0 = add i64 %val, 1
+ store i64 %tmp0, i64 addrspace(1)* %out
+ ret void
+}
+
+; Inline constants should always be folded.
+
+; CHECK-LABEL: {{^}}vector_inline:
+; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}
+; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}
+; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}
+; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}
+
+define void @vector_inline(<4 x i32> addrspace(1)* %out) {
+entry:
+ %tmp0 = call i32 @llvm.r600.read.tidig.x()
+ %tmp1 = add i32 %tmp0, 1
+ %tmp2 = add i32 %tmp0, 2
+ %tmp3 = add i32 %tmp0, 3
+ %vec0 = insertelement <4 x i32> undef, i32 %tmp0, i32 0
+ %vec1 = insertelement <4 x i32> %vec0, i32 %tmp1, i32 1
+ %vec2 = insertelement <4 x i32> %vec1, i32 %tmp2, i32 2
+ %vec3 = insertelement <4 x i32> %vec2, i32 %tmp3, i32 3
+ %tmp4 = xor <4 x i32> <i32 5, i32 5, i32 5, i32 5>, %vec3
+ store <4 x i32> %tmp4, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+; Immediates with one use should be folded
+; CHECK-LABEL: {{^}}imm_one_use:
+; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 0x64, v{{[0-9]+}}
+
+define void @imm_one_use(i32 addrspace(1)* %out) {
+entry:
+ %tmp0 = call i32 @llvm.r600.read.tidig.x()
+ %tmp1 = xor i32 %tmp0, 100
+ store i32 %tmp1, i32 addrspace(1)* %out
+ ret void
+}
+; CHECK-LABEL: {{^}}vector_imm:
+; CHECK: s_movk_i32 [[IMM:s[0-9]+]], 0x64
+; CHECK: v_xor_b32_e32 v{{[0-9]}}, [[IMM]], v{{[0-9]}}
+; CHECK: v_xor_b32_e32 v{{[0-9]}}, [[IMM]], v{{[0-9]}}
+; CHECK: v_xor_b32_e32 v{{[0-9]}}, [[IMM]], v{{[0-9]}}
+; CHECK: v_xor_b32_e32 v{{[0-9]}}, [[IMM]], v{{[0-9]}}
+
+define void @vector_imm(<4 x i32> addrspace(1)* %out) {
+entry:
+ %tmp0 = call i32 @llvm.r600.read.tidig.x()
+ %tmp1 = add i32 %tmp0, 1
+ %tmp2 = add i32 %tmp0, 2
+ %tmp3 = add i32 %tmp0, 3
+ %vec0 = insertelement <4 x i32> undef, i32 %tmp0, i32 0
+ %vec1 = insertelement <4 x i32> %vec0, i32 %tmp1, i32 1
+ %vec2 = insertelement <4 x i32> %vec1, i32 %tmp2, i32 2
+ %vec3 = insertelement <4 x i32> %vec2, i32 %tmp3, i32 3
+ %tmp4 = xor <4 x i32> <i32 100, i32 100, i32 100, i32 100>, %vec3
+ store <4 x i32> %tmp4, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+declare i32 @llvm.r600.read.tidig.x() #0
+attributes #0 = { readnone }
diff --git a/test/CodeGen/AMDGPU/operand-spacing.ll b/test/CodeGen/AMDGPU/operand-spacing.ll
new file mode 100644
index 000000000000..20420a84de6f
--- /dev/null
+++ b/test/CodeGen/AMDGPU/operand-spacing.ll
@@ -0,0 +1,18 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=SI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=VI -check-prefix=GCN %s
+
+; Make sure there isn't an extra space between the instruction name and the first operand.
+
+; GCN-LABEL: {{^}}add_f32:
+; SI-DAG: s_load_dword [[SREGA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dword [[SREGB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
+; VI-DAG: s_load_dword [[SREGA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; VI-DAG: s_load_dword [[SREGB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
+; GCN: v_mov_b32_e32 [[VREGB:v[0-9]+]], [[SREGB]]
+; GCN: v_add_f32_e32 [[RESULT:v[0-9]+]], [[SREGA]], [[VREGB]]
+; GCN: buffer_store_dword [[RESULT]],
+define void @add_f32(float addrspace(1)* %out, float %a, float %b) {
+ %result = fadd float %a, %b
+ store float %result, float addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/or.ll b/test/CodeGen/AMDGPU/or.ll
new file mode 100644
index 000000000000..1c04090b407f
--- /dev/null
+++ b/test/CodeGen/AMDGPU/or.ll
@@ -0,0 +1,178 @@
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+
+; FUNC-LABEL: {{^}}or_v2i32:
+; EG: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+define void @or_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+ %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
+ %a = load <2 x i32>, <2 x i32> addrspace(1)* %in
+ %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr
+ %result = or <2 x i32> %a, %b
+ store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}or_v4i32:
+; EG: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+define void @or_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+ %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
+ %a = load <4 x i32>, <4 x i32> addrspace(1)* %in
+ %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr
+ %result = or <4 x i32> %a, %b
+ store <4 x i32> %result, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}scalar_or_i32:
+; SI: s_or_b32
+define void @scalar_or_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+ %or = or i32 %a, %b
+ store i32 %or, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}vector_or_i32:
+; SI: v_or_b32_e32 v{{[0-9]}}
+define void @vector_or_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 %b) {
+ %loada = load i32, i32 addrspace(1)* %a
+ %or = or i32 %loada, %b
+ store i32 %or, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}scalar_or_literal_i32:
+; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x1869f
+define void @scalar_or_literal_i32(i32 addrspace(1)* %out, i32 %a) {
+ %or = or i32 %a, 99999
+ store i32 %or, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}vector_or_literal_i32:
+; SI: v_or_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}}
+define void @vector_or_literal_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) {
+ %loada = load i32, i32 addrspace(1)* %a, align 4
+ %or = or i32 %loada, 65535
+ store i32 %or, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}vector_or_inline_immediate_i32:
+; SI: v_or_b32_e32 v{{[0-9]+}}, 4, v{{[0-9]+}}
+define void @vector_or_inline_immediate_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) {
+ %loada = load i32, i32 addrspace(1)* %a, align 4
+ %or = or i32 %loada, 4
+ store i32 %or, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}scalar_or_i64:
+; EG-DAG: OR_INT * T{{[0-9]\.[XYZW]}}, KC0[2].W, KC0[3].Y
+; EG-DAG: OR_INT * T{{[0-9]\.[XYZW]}}, KC0[3].X, KC0[3].Z
+
+; SI: s_or_b64
+define void @scalar_or_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+ %or = or i64 %a, %b
+ store i64 %or, i64 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}vector_or_i64:
+; SI: v_or_b32_e32 v{{[0-9]}}
+; SI: v_or_b32_e32 v{{[0-9]}}
+define void @vector_or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+ %loada = load i64, i64 addrspace(1)* %a, align 8
+ %loadb = load i64, i64 addrspace(1)* %b, align 8
+ %or = or i64 %loada, %loadb
+ store i64 %or, i64 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}scalar_vector_or_i64:
+; SI: v_or_b32_e32 v{{[0-9]}}
+; SI: v_or_b32_e32 v{{[0-9]}}
+define void @scalar_vector_or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 %b) {
+ %loada = load i64, i64 addrspace(1)* %a
+ %or = or i64 %loada, %b
+ store i64 %or, i64 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}vector_or_i64_loadimm:
+; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0xdf77987f
+; SI-DAG: s_movk_i32 [[HI_S_IMM:s[0-9]+]], 0x146f
+; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}},
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
+; SI: s_endpgm
+define void @vector_or_i64_loadimm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+ %loada = load i64, i64 addrspace(1)* %a, align 8
+ %or = or i64 %loada, 22470723082367
+ store i64 %or, i64 addrspace(1)* %out
+ ret void
+}
+
+; FIXME: The or 0 should really be removed.
+; FUNC-LABEL: {{^}}vector_or_i64_imm:
+; SI: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}},
+; SI: v_or_b32_e32 {{v[0-9]+}}, 8, v[[LO_VREG]]
+; SI: v_or_b32_e32 {{v[0-9]+}}, 0, {{.*}}
+; SI: s_endpgm
+define void @vector_or_i64_imm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+ %loada = load i64, i64 addrspace(1)* %a, align 8
+ %or = or i64 %loada, 8
+ store i64 %or, i64 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}trunc_i64_or_to_i32:
+; SI: s_load_dword s[[SREG0:[0-9]+]]
+; SI: s_load_dword s[[SREG1:[0-9]+]]
+; SI: s_or_b32 s[[SRESULT:[0-9]+]], s[[SREG1]], s[[SREG0]]
+; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], s[[SRESULT]]
+; SI: buffer_store_dword [[VRESULT]],
+define void @trunc_i64_or_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) {
+ %add = or i64 %b, %a
+ %trunc = trunc i64 %add to i32
+ store i32 %trunc, i32 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}or_i1:
+; EG: OR_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], PS}}
+
+; SI: s_or_b64 s[{{[0-9]+:[0-9]+}}], vcc, s[{{[0-9]+:[0-9]+}}]
+define void @or_i1(i32 addrspace(1)* %out, float addrspace(1)* %in0, float addrspace(1)* %in1) {
+ %a = load float, float addrspace(1)* %in0
+ %b = load float, float addrspace(1)* %in1
+ %acmp = fcmp oge float %a, 0.000000e+00
+ %bcmp = fcmp oge float %b, 0.000000e+00
+ %or = or i1 %acmp, %bcmp
+ %result = zext i1 %or to i32
+ store i32 %result, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_or_i1:
+; SI: s_or_b64 s[{{[0-9]+:[0-9]+}}], vcc, s[{{[0-9]+:[0-9]+}}]
+define void @s_or_i1(i1 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) {
+ %cmp0 = icmp eq i32 %a, %b
+ %cmp1 = icmp eq i32 %c, %d
+ %or = or i1 %cmp0, %cmp1
+ store i1 %or, i1 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/packetizer.ll b/test/CodeGen/AMDGPU/packetizer.ll
new file mode 100644
index 000000000000..49a7c0df748f
--- /dev/null
+++ b/test/CodeGen/AMDGPU/packetizer.ll
@@ -0,0 +1,34 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s
+
+; CHECK: {{^}}test:
+; CHECK: BIT_ALIGN_INT T{{[0-9]}}.X
+; CHECK: BIT_ALIGN_INT T{{[0-9]}}.Y
+; CHECK: BIT_ALIGN_INT T{{[0-9]}}.Z
+; CHECK: BIT_ALIGN_INT * T{{[0-9]}}.W
+
+define void @test(i32 addrspace(1)* %out, i32 %x_arg, i32 %y_arg, i32 %z_arg, i32 %w_arg, i32 %e) {
+entry:
+ %shl = sub i32 32, %e
+ %x = add i32 %x_arg, 1
+ %x.0 = shl i32 %x, %shl
+ %x.1 = lshr i32 %x, %e
+ %x.2 = or i32 %x.0, %x.1
+ %y = add i32 %y_arg, 1
+ %y.0 = shl i32 %y, %shl
+ %y.1 = lshr i32 %y, %e
+ %y.2 = or i32 %y.0, %y.1
+ %z = add i32 %z_arg, 1
+ %z.0 = shl i32 %z, %shl
+ %z.1 = lshr i32 %z, %e
+ %z.2 = or i32 %z.0, %z.1
+ %w = add i32 %w_arg, 1
+ %w.0 = shl i32 %w, %shl
+ %w.1 = lshr i32 %w, %e
+ %w.2 = or i32 %w.0, %w.1
+ %xy = or i32 %x.2, %y.2
+ %zw = or i32 %z.2, %w.2
+ %xyzw = or i32 %xy, %zw
+ store i32 %xyzw, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/parallelandifcollapse.ll b/test/CodeGen/AMDGPU/parallelandifcollapse.ll
new file mode 100644
index 000000000000..f32b044198ab
--- /dev/null
+++ b/test/CodeGen/AMDGPU/parallelandifcollapse.ll
@@ -0,0 +1,59 @@
+; Function Attrs: nounwind
+; RUN: llc -march=r600 -mcpu=redwood -mattr=-promote-alloca < %s | FileCheck %s
+;
+; CFG flattening should use parallel-and mode to generate branch conditions and
+; then merge if-regions with the same bodies.
+;
+; CHECK: AND_INT
+; CHECK-NEXT: AND_INT
+; CHECK-NEXT: OR_INT
+
+; FIXME: For some reason, having the allocas here allowed the flatten-cfg pass
+; to do its transformation; however, now that we are using local memory for
+; allocas, the transformation isn't happening.
+
+define void @_Z9chk1D_512v() #0 {
+entry:
+ %a0 = alloca i32, align 4
+ %b0 = alloca i32, align 4
+ %c0 = alloca i32, align 4
+ %d0 = alloca i32, align 4
+ %a1 = alloca i32, align 4
+ %b1 = alloca i32, align 4
+ %c1 = alloca i32, align 4
+ %d1 = alloca i32, align 4
+ %data = alloca i32, align 4
+ %0 = load i32, i32* %a0, align 4
+ %1 = load i32, i32* %b0, align 4
+ %cmp = icmp ne i32 %0, %1
+ br i1 %cmp, label %land.lhs.true, label %if.end
+
+land.lhs.true: ; preds = %entry
+ %2 = load i32, i32* %c0, align 4
+ %3 = load i32, i32* %d0, align 4
+ %cmp1 = icmp ne i32 %2, %3
+ br i1 %cmp1, label %if.then, label %if.end
+
+if.then: ; preds = %land.lhs.true
+ store i32 1, i32* %data, align 4
+ br label %if.end
+
+if.end: ; preds = %if.then, %land.lhs.true, %entry
+ %4 = load i32, i32* %a1, align 4
+ %5 = load i32, i32* %b1, align 4
+ %cmp2 = icmp ne i32 %4, %5
+ br i1 %cmp2, label %land.lhs.true3, label %if.end6
+
+land.lhs.true3: ; preds = %if.end
+ %6 = load i32, i32* %c1, align 4
+ %7 = load i32, i32* %d1, align 4
+ %cmp4 = icmp ne i32 %6, %7
+ br i1 %cmp4, label %if.then5, label %if.end6
+
+if.then5: ; preds = %land.lhs.true3
+ store i32 1, i32* %data, align 4
+ br label %if.end6
+
+if.end6: ; preds = %if.then5, %land.lhs.true3, %if.end
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/parallelorifcollapse.ll b/test/CodeGen/AMDGPU/parallelorifcollapse.ll
new file mode 100644
index 000000000000..1da1e91b8ab8
--- /dev/null
+++ b/test/CodeGen/AMDGPU/parallelorifcollapse.ll
@@ -0,0 +1,66 @@
+; Function Attrs: nounwind
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+;
+; CFG flattening should use parallel-or to generate branch conditions and
+; then merge if-regions with the same bodies.
+
+; FIXME: For some reason, having the allocas here allowed the flatten-cfg pass
+; to do its transformation; however, now that we are using local memory for
+; allocas, the transformation isn't happening.
+; XFAIL: *
+;
+; CHECK: OR_INT
+; CHECK-NEXT: OR_INT
+; CHECK-NEXT: OR_INT
+define void @_Z9chk1D_512v() #0 {
+entry:
+ %a0 = alloca i32, align 4
+ %b0 = alloca i32, align 4
+ %c0 = alloca i32, align 4
+ %d0 = alloca i32, align 4
+ %a1 = alloca i32, align 4
+ %b1 = alloca i32, align 4
+ %c1 = alloca i32, align 4
+ %d1 = alloca i32, align 4
+ %data = alloca i32, align 4
+ %0 = load i32, i32* %a0, align 4
+ %1 = load i32, i32* %b0, align 4
+ %cmp = icmp ne i32 %0, %1
+ br i1 %cmp, label %land.lhs.true, label %if.else
+
+land.lhs.true: ; preds = %entry
+ %2 = load i32, i32* %c0, align 4
+ %3 = load i32, i32* %d0, align 4
+ %cmp1 = icmp ne i32 %2, %3
+ br i1 %cmp1, label %if.then, label %if.else
+
+if.then: ; preds = %land.lhs.true
+ br label %if.end
+
+if.else: ; preds = %land.lhs.true, %entry
+ store i32 1, i32* %data, align 4
+ br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+ %4 = load i32, i32* %a1, align 4
+ %5 = load i32, i32* %b1, align 4
+ %cmp2 = icmp ne i32 %4, %5
+ br i1 %cmp2, label %land.lhs.true3, label %if.else6
+
+land.lhs.true3: ; preds = %if.end
+ %6 = load i32, i32* %c1, align 4
+ %7 = load i32, i32* %d1, align 4
+ %cmp4 = icmp ne i32 %6, %7
+ br i1 %cmp4, label %if.then5, label %if.else6
+
+if.then5: ; preds = %land.lhs.true3
+ br label %if.end7
+
+if.else6: ; preds = %land.lhs.true3, %if.end
+ store i32 1, i32* %data, align 4
+ br label %if.end7
+
+if.end7: ; preds = %if.else6, %if.then5
+ ret void
+}
+
diff --git a/test/CodeGen/AMDGPU/predicate-dp4.ll b/test/CodeGen/AMDGPU/predicate-dp4.ll
new file mode 100644
index 000000000000..6bc187594359
--- /dev/null
+++ b/test/CodeGen/AMDGPU/predicate-dp4.ll
@@ -0,0 +1,27 @@
+;RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s
+
+; CHECK-LABEL: {{^}}main:
+; CHECK: PRED_SETE_INT * Pred,
+; CHECK: DOT4 T{{[0-9]+}}.X, T0.X, T0.X, Pred_sel_one
+define void @main(<4 x float> inreg) #0 {
+main_body:
+ %1 = extractelement <4 x float> %0, i32 0
+ %2 = bitcast float %1 to i32
+ %3 = icmp eq i32 %2, 0
+ br i1 %3, label %IF, label %ENDIF
+
+IF: ; preds = %main_body
+ %4 = call float @llvm.AMDGPU.dp4(<4 x float> %0, <4 x float> %0)
+ br label %ENDIF
+
+ENDIF: ; preds = %IF, %main_body
+ %5 = phi float [%4, %IF], [0.000000e+00, %main_body]
+ %6 = insertelement <4 x float> undef, float %5, i32 0
+ call void @llvm.R600.store.swizzle(<4 x float> %6, i32 0, i32 0)
+ ret void
+}
+
+declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+attributes #1 = { readnone }
+attributes #0 = { "ShaderType"="0" }
diff --git a/test/CodeGen/AMDGPU/predicates.ll b/test/CodeGen/AMDGPU/predicates.ll
new file mode 100644
index 000000000000..0ce74d97ba8e
--- /dev/null
+++ b/test/CodeGen/AMDGPU/predicates.ll
@@ -0,0 +1,104 @@
+; RUN: llc < %s -march=r600 -mattr=disable-irstructurizer -mcpu=redwood | FileCheck %s
+
+; These tests make sure the compiler is optimizing branches using predicates
+; when it is legal to do so.
+
+; CHECK: {{^}}simple_if:
+; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Pred,
+; CHECK: LSHL * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel
+define void @simple_if(i32 addrspace(1)* %out, i32 %in) {
+entry:
+ %0 = icmp sgt i32 %in, 0
+ br i1 %0, label %IF, label %ENDIF
+
+IF:
+ %1 = shl i32 %in, 1
+ br label %ENDIF
+
+ENDIF:
+ %2 = phi i32 [ %in, %entry ], [ %1, %IF ]
+ store i32 %2, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK: {{^}}simple_if_else:
+; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Pred,
+; CHECK: LSH{{[LR] \* T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel
+; CHECK: LSH{{[LR] \* T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel
+define void @simple_if_else(i32 addrspace(1)* %out, i32 %in) {
+entry:
+ %0 = icmp sgt i32 %in, 0
+ br i1 %0, label %IF, label %ELSE
+
+IF:
+ %1 = shl i32 %in, 1
+ br label %ENDIF
+
+ELSE:
+ %2 = lshr i32 %in, 1
+ br label %ENDIF
+
+ENDIF:
+ %3 = phi i32 [ %1, %IF ], [ %2, %ELSE ]
+ store i32 %3, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK: {{^}}nested_if:
+; CHECK: ALU_PUSH_BEFORE
+; CHECK: JUMP
+; CHECK: POP
+; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Exec
+; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Pred,
+; CHECK: LSHL * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel
+define void @nested_if(i32 addrspace(1)* %out, i32 %in) {
+entry:
+ %0 = icmp sgt i32 %in, 0
+ br i1 %0, label %IF0, label %ENDIF
+
+IF0:
+ %1 = add i32 %in, 10
+ %2 = icmp sgt i32 %1, 0
+ br i1 %2, label %IF1, label %ENDIF
+
+IF1:
+ %3 = shl i32 %1, 1
+ br label %ENDIF
+
+ENDIF:
+ %4 = phi i32 [%in, %entry], [%1, %IF0], [%3, %IF1]
+ store i32 %4, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK: {{^}}nested_if_else:
+; CHECK: ALU_PUSH_BEFORE
+; CHECK: JUMP
+; CHECK: POP
+; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Exec
+; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Pred,
+; CHECK: LSH{{[LR] \* T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel
+; CHECK: LSH{{[LR] \* T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel
+define void @nested_if_else(i32 addrspace(1)* %out, i32 %in) {
+entry:
+ %0 = icmp sgt i32 %in, 0
+ br i1 %0, label %IF0, label %ENDIF
+
+IF0:
+ %1 = add i32 %in, 10
+ %2 = icmp sgt i32 %1, 0
+ br i1 %2, label %IF1, label %ELSE1
+
+IF1:
+ %3 = shl i32 %1, 1
+ br label %ENDIF
+
+ELSE1:
+ %4 = lshr i32 %in, 1
+ br label %ENDIF
+
+ENDIF:
+ %5 = phi i32 [%in, %entry], [%3, %IF1], [%4, %ELSE1]
+ store i32 %5, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/private-memory-atomics.ll b/test/CodeGen/AMDGPU/private-memory-atomics.ll
new file mode 100644
index 000000000000..a008ac98a43b
--- /dev/null
+++ b/test/CodeGen/AMDGPU/private-memory-atomics.ll
@@ -0,0 +1,32 @@
+; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI < %s
+; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s
+
+; This works because the promote-alloca pass replaces these with LDS atomics.
+
+; Private atomics have no real use, but at least we shouldn't crash on them.
+define void @atomicrmw_private(i32 addrspace(1)* %out, i32 %in) nounwind {
+entry:
+ %tmp = alloca [2 x i32]
+ %tmp1 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 0
+ %tmp2 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 1
+ store i32 0, i32* %tmp1
+ store i32 1, i32* %tmp2
+ %tmp3 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 %in
+ %tmp4 = atomicrmw add i32* %tmp3, i32 7 acq_rel
+ store i32 %tmp4, i32 addrspace(1)* %out
+ ret void
+}
+
+define void @cmpxchg_private(i32 addrspace(1)* %out, i32 %in) nounwind {
+entry:
+ %tmp = alloca [2 x i32]
+ %tmp1 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 0
+ %tmp2 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 1
+ store i32 0, i32* %tmp1
+ store i32 1, i32* %tmp2
+ %tmp3 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 %in
+ %tmp4 = cmpxchg i32* %tmp3, i32 0, i32 1 acq_rel monotonic
+ %val = extractvalue { i32, i1 } %tmp4, 0
+ store i32 %val, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/private-memory-broken.ll b/test/CodeGen/AMDGPU/private-memory-broken.ll
new file mode 100644
index 000000000000..6b18a19f1956
--- /dev/null
+++ b/test/CodeGen/AMDGPU/private-memory-broken.ll
@@ -0,0 +1,21 @@
+; RUN: not llc -verify-machineinstrs -march=amdgcn -mcpu=SI %s -o /dev/null 2>&1 | FileCheck %s
+; RUN: not llc -verify-machineinstrs -march=amdgcn -mcpu=tonga %s -o /dev/null 2>&1 | FileCheck %s
+
+; Make sure the promote-alloca pass doesn't crash.
+
+; CHECK: unsupported call
+
+declare i32 @foo(i32*) nounwind
+
+define void @call_private(i32 addrspace(1)* %out, i32 %in) nounwind {
+entry:
+ %tmp = alloca [2 x i32]
+ %tmp1 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 0
+ %tmp2 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 1
+ store i32 0, i32* %tmp1
+ store i32 1, i32* %tmp2
+ %tmp3 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 %in
+ %val = call i32 @foo(i32* %tmp3) nounwind
+ store i32 %val, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/private-memory.ll b/test/CodeGen/AMDGPU/private-memory.ll
new file mode 100644
index 000000000000..1c5629780508
--- /dev/null
+++ b/test/CodeGen/AMDGPU/private-memory.ll
@@ -0,0 +1,313 @@
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s -check-prefix=R600 -check-prefix=FUNC
+; RUN: llc -show-mc-encoding -mattr=+promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC
+; RUN: llc -show-mc-encoding -mattr=-promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC
+; RUN: llc -show-mc-encoding -mattr=+promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC
+; RUN: llc -show-mc-encoding -mattr=-promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC
+
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+
+; FUNC-LABEL: {{^}}mova_same_clause:
+
+; R600: LDS_WRITE
+; R600: LDS_WRITE
+; R600: LDS_READ
+; R600: LDS_READ
+
+; SI-PROMOTE: ds_write_b32
+; SI-PROMOTE: ds_write_b32
+; SI-PROMOTE: ds_read_b32
+; SI-PROMOTE: ds_read_b32
+
+; SI-ALLOCA: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x70,0xe0
+; SI-ALLOCA: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x70,0xe0
+define void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
+entry:
+ %stack = alloca [5 x i32], align 4
+ %0 = load i32, i32 addrspace(1)* %in, align 4
+ %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
+ store i32 4, i32* %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
+ %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
+ store i32 5, i32* %arrayidx3, align 4
+ %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
+ %2 = load i32, i32* %arrayidx10, align 4
+ store i32 %2, i32 addrspace(1)* %out, align 4
+ %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
+ %3 = load i32, i32* %arrayidx12
+ %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
+ store i32 %3, i32 addrspace(1)* %arrayidx13
+ ret void
+}
+
+; This test checks that the stack offset is calculated correctly for structs.
+; All register loads/stores should be optimized away, so there shouldn't be
+; any MOVA instructions.
+;
+; XXX: The generated code has unnecessary MOVs; we should be able to optimize
+; this.
+
+; FUNC-LABEL: {{^}}multiple_structs:
+; R600-NOT: MOVA_INT
+; SI-NOT: v_movrel
+; SI-NOT: v_movrel
+%struct.point = type { i32, i32 }
+
+define void @multiple_structs(i32 addrspace(1)* %out) {
+entry:
+ %a = alloca %struct.point
+ %b = alloca %struct.point
+ %a.x.ptr = getelementptr %struct.point, %struct.point* %a, i32 0, i32 0
+ %a.y.ptr = getelementptr %struct.point, %struct.point* %a, i32 0, i32 1
+ %b.x.ptr = getelementptr %struct.point, %struct.point* %b, i32 0, i32 0
+ %b.y.ptr = getelementptr %struct.point, %struct.point* %b, i32 0, i32 1
+ store i32 0, i32* %a.x.ptr
+ store i32 1, i32* %a.y.ptr
+ store i32 2, i32* %b.x.ptr
+ store i32 3, i32* %b.y.ptr
+ %a.indirect.ptr = getelementptr %struct.point, %struct.point* %a, i32 0, i32 0
+ %b.indirect.ptr = getelementptr %struct.point, %struct.point* %b, i32 0, i32 0
+ %a.indirect = load i32, i32* %a.indirect.ptr
+ %b.indirect = load i32, i32* %b.indirect.ptr
+ %0 = add i32 %a.indirect, %b.indirect
+ store i32 %0, i32 addrspace(1)* %out
+ ret void
+}
+
+; Test direct access of a private array inside a loop. The private array
+; loads and stores should be lowered to copies, so there shouldn't be any
+; MOVA instructions.
+
+; FUNC-LABEL: {{^}}direct_loop:
+; R600-NOT: MOVA_INT
+; SI-NOT: v_movrel
+
+define void @direct_loop(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+entry:
+ %prv_array_const = alloca [2 x i32]
+ %prv_array = alloca [2 x i32]
+ %a = load i32, i32 addrspace(1)* %in
+ %b_src_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+ %b = load i32, i32 addrspace(1)* %b_src_ptr
+ %a_dst_ptr = getelementptr [2 x i32], [2 x i32]* %prv_array_const, i32 0, i32 0
+ store i32 %a, i32* %a_dst_ptr
+ %b_dst_ptr = getelementptr [2 x i32], [2 x i32]* %prv_array_const, i32 0, i32 1
+ store i32 %b, i32* %b_dst_ptr
+ br label %for.body
+
+for.body:
+ %inc = phi i32 [0, %entry], [%count, %for.body]
+ %x_ptr = getelementptr [2 x i32], [2 x i32]* %prv_array_const, i32 0, i32 0
+ %x = load i32, i32* %x_ptr
+ %y_ptr = getelementptr [2 x i32], [2 x i32]* %prv_array, i32 0, i32 0
+ %y = load i32, i32* %y_ptr
+ %xy = add i32 %x, %y
+ store i32 %xy, i32* %y_ptr
+ %count = add i32 %inc, 1
+ %done = icmp eq i32 %count, 4095
+ br i1 %done, label %for.end, label %for.body
+
+for.end:
+ %value_ptr = getelementptr [2 x i32], [2 x i32]* %prv_array, i32 0, i32 0
+ %value = load i32, i32* %value_ptr
+ store i32 %value, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}short_array:
+
+; R600: MOVA_INT
+
+; SI-PROMOTE-DAG: buffer_store_short v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x68,0xe0
+; SI-PROMOTE-DAG: buffer_store_short v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:2 ; encoding: [0x02,0x10,0x68,0xe0
+; SI-PROMOTE: buffer_load_sshort v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
+define void @short_array(i32 addrspace(1)* %out, i32 %index) {
+entry:
+ %0 = alloca [2 x i16]
+ %1 = getelementptr [2 x i16], [2 x i16]* %0, i32 0, i32 0
+ %2 = getelementptr [2 x i16], [2 x i16]* %0, i32 0, i32 1
+ store i16 0, i16* %1
+ store i16 1, i16* %2
+ %3 = getelementptr [2 x i16], [2 x i16]* %0, i32 0, i32 %index
+ %4 = load i16, i16* %3
+ %5 = sext i16 %4 to i32
+ store i32 %5, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}char_array:
+
+; R600: MOVA_INT
+
+; SI-DAG: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x60,0xe0
+; SI-DAG: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:1 ; encoding: [0x01,0x10,0x60,0xe0
+define void @char_array(i32 addrspace(1)* %out, i32 %index) {
+entry:
+ %0 = alloca [2 x i8]
+ %1 = getelementptr [2 x i8], [2 x i8]* %0, i32 0, i32 0
+ %2 = getelementptr [2 x i8], [2 x i8]* %0, i32 0, i32 1
+ store i8 0, i8* %1
+ store i8 1, i8* %2
+ %3 = getelementptr [2 x i8], [2 x i8]* %0, i32 0, i32 %index
+ %4 = load i8, i8* %3
+ %5 = sext i8 %4 to i32
+ store i32 %5, i32 addrspace(1)* %out
+ ret void
+
+}
+
+; Make sure we don't overwrite workitem information with private memory
+
+; FUNC-LABEL: {{^}}work_item_info:
+; R600-NOT: MOV T0.X
+; Additional check in case the move ends up in the last slot
+; R600-NOT: MOV * T0.X
+
+; SI-NOT: v_mov_b32_e{{(32|64)}} v0
+define void @work_item_info(i32 addrspace(1)* %out, i32 %in) {
+entry:
+ %0 = alloca [2 x i32]
+ %1 = getelementptr [2 x i32], [2 x i32]* %0, i32 0, i32 0
+ %2 = getelementptr [2 x i32], [2 x i32]* %0, i32 0, i32 1
+ store i32 0, i32* %1
+ store i32 1, i32* %2
+ %3 = getelementptr [2 x i32], [2 x i32]* %0, i32 0, i32 %in
+ %4 = load i32, i32* %3
+ %5 = call i32 @llvm.r600.read.tidig.x()
+ %6 = add i32 %4, %5
+ store i32 %6, i32 addrspace(1)* %out
+ ret void
+}
+
+; Test that two stack objects are not stored in the same register
+; The second stack object should be in T3.X
+; FUNC-LABEL: {{^}}no_overlap:
+; R600_CHECK: MOV
+; R600_CHECK: [[CHAN:[XYZW]]]+
+; R600-NOT: [[CHAN]]+
+; SI: v_mov_b32_e32 v3
+define void @no_overlap(i32 addrspace(1)* %out, i32 %in) {
+entry:
+ %0 = alloca [3 x i8], align 1
+ %1 = alloca [2 x i8], align 1
+ %2 = getelementptr [3 x i8], [3 x i8]* %0, i32 0, i32 0
+ %3 = getelementptr [3 x i8], [3 x i8]* %0, i32 0, i32 1
+ %4 = getelementptr [3 x i8], [3 x i8]* %0, i32 0, i32 2
+ %5 = getelementptr [2 x i8], [2 x i8]* %1, i32 0, i32 0
+ %6 = getelementptr [2 x i8], [2 x i8]* %1, i32 0, i32 1
+ store i8 0, i8* %2
+ store i8 1, i8* %3
+ store i8 2, i8* %4
+ store i8 1, i8* %5
+ store i8 0, i8* %6
+ %7 = getelementptr [3 x i8], [3 x i8]* %0, i32 0, i32 %in
+ %8 = getelementptr [2 x i8], [2 x i8]* %1, i32 0, i32 %in
+ %9 = load i8, i8* %7
+ %10 = load i8, i8* %8
+ %11 = add i8 %9, %10
+ %12 = sext i8 %11 to i32
+ store i32 %12, i32 addrspace(1)* %out
+ ret void
+}
+
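+; No CHECK lines for the following tests; they only verify that codegen
+; succeeds when dynamically indexing nested private arrays of i8, i32, and i64.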
+define void @char_array_array(i32 addrspace(1)* %out, i32 %index) {
+entry:
+ %alloca = alloca [2 x [2 x i8]]
+ %gep0 = getelementptr [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 0
+ %gep1 = getelementptr [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 1
+ store i8 0, i8* %gep0
+ store i8 1, i8* %gep1
+ %gep2 = getelementptr [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 %index
+ %load = load i8, i8* %gep2
+ %sext = sext i8 %load to i32
+ store i32 %sext, i32 addrspace(1)* %out
+ ret void
+}
+
+define void @i32_array_array(i32 addrspace(1)* %out, i32 %index) {
+entry:
+ %alloca = alloca [2 x [2 x i32]]
+ %gep0 = getelementptr [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 0
+ %gep1 = getelementptr [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 1
+ store i32 0, i32* %gep0
+ store i32 1, i32* %gep1
+ %gep2 = getelementptr [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 %index
+ %load = load i32, i32* %gep2
+ store i32 %load, i32 addrspace(1)* %out
+ ret void
+}
+
+define void @i64_array_array(i64 addrspace(1)* %out, i32 %index) {
+entry:
+ %alloca = alloca [2 x [2 x i64]]
+ %gep0 = getelementptr [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 0
+ %gep1 = getelementptr [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 1
+ store i64 0, i64* %gep0
+ store i64 1, i64* %gep1
+ %gep2 = getelementptr [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 %index
+ %load = load i64, i64* %gep2
+ store i64 %load, i64 addrspace(1)* %out
+ ret void
+}
+
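+; No CHECK lines for the following tests; they only verify that codegen
+; succeeds when dynamically indexing private arrays of %struct.pair32 elements.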
+%struct.pair32 = type { i32, i32 }
+
+define void @struct_array_array(i32 addrspace(1)* %out, i32 %index) {
+entry:
+ %alloca = alloca [2 x [2 x %struct.pair32]]
+ %gep0 = getelementptr [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 0, i32 1
+ %gep1 = getelementptr [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 1, i32 1
+ store i32 0, i32* %gep0
+ store i32 1, i32* %gep1
+ %gep2 = getelementptr [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 %index, i32 0
+ %load = load i32, i32* %gep2
+ store i32 %load, i32 addrspace(1)* %out
+ ret void
+}
+
+define void @struct_pair32_array(i32 addrspace(1)* %out, i32 %index) {
+entry:
+ %alloca = alloca [2 x %struct.pair32]
+ %gep0 = getelementptr [2 x %struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 0, i32 1
+ %gep1 = getelementptr [2 x %struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 1, i32 0
+ store i32 0, i32* %gep0
+ store i32 1, i32* %gep1
+ %gep2 = getelementptr [2 x %struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 %index, i32 0
+ %load = load i32, i32* %gep2
+ store i32 %load, i32 addrspace(1)* %out
+ ret void
+}
+
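+; No CHECK lines; this only verifies that codegen succeeds when selecting
+; between two private pointers and loading through the result.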
+define void @select_private(i32 addrspace(1)* %out, i32 %in) nounwind {
+entry:
+ %tmp = alloca [2 x i32]
+ %tmp1 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 0
+ %tmp2 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 1
+ store i32 0, i32* %tmp1
+ store i32 1, i32* %tmp2
+ %cmp = icmp eq i32 %in, 0
+ %sel = select i1 %cmp, i32* %tmp1, i32* %tmp2
+ %load = load i32, i32* %sel
+ store i32 %load, i32 addrspace(1)* %out
+ ret void
+}
+
+; AMDGPUPromoteAlloca does not know how to handle ptrtoint. When it
+; finds one, it should stop trying to promote.
+
+; FUNC-LABEL: ptrtoint:
+; SI-NOT: ds_write
+; SI: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen
+; SI: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:5
+define void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+ %alloca = alloca [16 x i32]
+ %tmp0 = getelementptr [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
+ store i32 5, i32* %tmp0
+ %tmp1 = ptrtoint [16 x i32]* %alloca to i32
+ %tmp2 = add i32 %tmp1, 5
+ %tmp3 = inttoptr i32 %tmp2 to i32*
+ %tmp4 = getelementptr i32, i32* %tmp3, i32 %b
+ %tmp5 = load i32, i32* %tmp4
+ store i32 %tmp5, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/pv-packing.ll b/test/CodeGen/AMDGPU/pv-packing.ll
new file mode 100644
index 000000000000..abeae563ff3f
--- /dev/null
+++ b/test/CodeGen/AMDGPU/pv-packing.ll
@@ -0,0 +1,45 @@
+; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s
+
+;CHECK: DOT4 T{{[0-9]\.X}}
+;CHECK: MULADD_IEEE * T{{[0-9]\.W}}
+
+define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3) #0 {
+main_body:
+ %0 = extractelement <4 x float> %reg1, i32 0
+ %1 = extractelement <4 x float> %reg1, i32 1
+ %2 = extractelement <4 x float> %reg1, i32 2
+ %3 = extractelement <4 x float> %reg2, i32 0
+ %4 = extractelement <4 x float> %reg2, i32 1
+ %5 = extractelement <4 x float> %reg2, i32 2
+ %6 = extractelement <4 x float> %reg3, i32 0
+ %7 = extractelement <4 x float> %reg3, i32 1
+ %8 = extractelement <4 x float> %reg3, i32 2
+ %9 = load <4 x float>, <4 x float> addrspace(8)* null
+ %10 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+ %11 = call float @llvm.AMDGPU.dp4(<4 x float> %9, <4 x float> %9)
+ %12 = fmul float %0, %3
+ %13 = fadd float %12, %6
+ %14 = fmul float %1, %4
+ %15 = fadd float %14, %7
+ %16 = fmul float %2, %5
+ %17 = fadd float %16, %8
+ %18 = fmul float %11, %11
+ %19 = fadd float %18, %0
+ %20 = insertelement <4 x float> undef, float %13, i32 0
+ %21 = insertelement <4 x float> %20, float %15, i32 1
+ %22 = insertelement <4 x float> %21, float %17, i32 2
+ %23 = insertelement <4 x float> %22, float %19, i32 3
+ %24 = call float @llvm.AMDGPU.dp4(<4 x float> %23, <4 x float> %10)
+ %25 = insertelement <4 x float> undef, float %24, i32 0
+ call void @llvm.R600.store.swizzle(<4 x float> %25, i32 0, i32 2)
+ ret void
+}
+
+; Function Attrs: readnone
+declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1
+
+
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+
+attributes #0 = { "ShaderType"="1" }
+attributes #1 = { readnone }
diff --git a/test/CodeGen/AMDGPU/pv.ll b/test/CodeGen/AMDGPU/pv.ll
new file mode 100644
index 000000000000..9a57dd19765a
--- /dev/null
+++ b/test/CodeGen/AMDGPU/pv.ll
@@ -0,0 +1,241 @@
+; RUN: llc < %s -march=r600 | FileCheck %s
+
+; CHECK: DOT4 * T{{[0-9]\.W}} (MASKED)
+; CHECK: MAX T{{[0-9].[XYZW]}}, 0.0, PV.X
+
+define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3, <4 x float> inreg %reg4, <4 x float> inreg %reg5, <4 x float> inreg %reg6, <4 x float> inreg %reg7) #0 {
+main_body:
+ %0 = extractelement <4 x float> %reg1, i32 0
+ %1 = extractelement <4 x float> %reg1, i32 1
+ %2 = extractelement <4 x float> %reg1, i32 2
+ %3 = extractelement <4 x float> %reg1, i32 3
+ %4 = extractelement <4 x float> %reg2, i32 0
+ %5 = extractelement <4 x float> %reg2, i32 1
+ %6 = extractelement <4 x float> %reg2, i32 2
+ %7 = extractelement <4 x float> %reg2, i32 3
+ %8 = extractelement <4 x float> %reg3, i32 0
+ %9 = extractelement <4 x float> %reg3, i32 1
+ %10 = extractelement <4 x float> %reg3, i32 2
+ %11 = extractelement <4 x float> %reg3, i32 3
+ %12 = extractelement <4 x float> %reg4, i32 0
+ %13 = extractelement <4 x float> %reg4, i32 1
+ %14 = extractelement <4 x float> %reg4, i32 2
+ %15 = extractelement <4 x float> %reg4, i32 3
+ %16 = extractelement <4 x float> %reg5, i32 0
+ %17 = extractelement <4 x float> %reg5, i32 1
+ %18 = extractelement <4 x float> %reg5, i32 2
+ %19 = extractelement <4 x float> %reg5, i32 3
+ %20 = extractelement <4 x float> %reg6, i32 0
+ %21 = extractelement <4 x float> %reg6, i32 1
+ %22 = extractelement <4 x float> %reg6, i32 2
+ %23 = extractelement <4 x float> %reg6, i32 3
+ %24 = extractelement <4 x float> %reg7, i32 0
+ %25 = extractelement <4 x float> %reg7, i32 1
+ %26 = extractelement <4 x float> %reg7, i32 2
+ %27 = extractelement <4 x float> %reg7, i32 3
+ %28 = load <4 x float>, <4 x float> addrspace(8)* null
+ %29 = extractelement <4 x float> %28, i32 0
+ %30 = fmul float %0, %29
+ %31 = load <4 x float>, <4 x float> addrspace(8)* null
+ %32 = extractelement <4 x float> %31, i32 1
+ %33 = fmul float %0, %32
+ %34 = load <4 x float>, <4 x float> addrspace(8)* null
+ %35 = extractelement <4 x float> %34, i32 2
+ %36 = fmul float %0, %35
+ %37 = load <4 x float>, <4 x float> addrspace(8)* null
+ %38 = extractelement <4 x float> %37, i32 3
+ %39 = fmul float %0, %38
+ %40 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+ %41 = extractelement <4 x float> %40, i32 0
+ %42 = fmul float %1, %41
+ %43 = fadd float %42, %30
+ %44 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+ %45 = extractelement <4 x float> %44, i32 1
+ %46 = fmul float %1, %45
+ %47 = fadd float %46, %33
+ %48 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+ %49 = extractelement <4 x float> %48, i32 2
+ %50 = fmul float %1, %49
+ %51 = fadd float %50, %36
+ %52 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+ %53 = extractelement <4 x float> %52, i32 3
+ %54 = fmul float %1, %53
+ %55 = fadd float %54, %39
+ %56 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+ %57 = extractelement <4 x float> %56, i32 0
+ %58 = fmul float %2, %57
+ %59 = fadd float %58, %43
+ %60 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+ %61 = extractelement <4 x float> %60, i32 1
+ %62 = fmul float %2, %61
+ %63 = fadd float %62, %47
+ %64 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+ %65 = extractelement <4 x float> %64, i32 2
+ %66 = fmul float %2, %65
+ %67 = fadd float %66, %51
+ %68 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+ %69 = extractelement <4 x float> %68, i32 3
+ %70 = fmul float %2, %69
+ %71 = fadd float %70, %55
+ %72 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
+ %73 = extractelement <4 x float> %72, i32 0
+ %74 = fmul float %3, %73
+ %75 = fadd float %74, %59
+ %76 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
+ %77 = extractelement <4 x float> %76, i32 1
+ %78 = fmul float %3, %77
+ %79 = fadd float %78, %63
+ %80 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
+ %81 = extractelement <4 x float> %80, i32 2
+ %82 = fmul float %3, %81
+ %83 = fadd float %82, %67
+ %84 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
+ %85 = extractelement <4 x float> %84, i32 3
+ %86 = fmul float %3, %85
+ %87 = fadd float %86, %71
+ %88 = insertelement <4 x float> undef, float %4, i32 0
+ %89 = insertelement <4 x float> %88, float %5, i32 1
+ %90 = insertelement <4 x float> %89, float %6, i32 2
+ %91 = insertelement <4 x float> %90, float 0.000000e+00, i32 3
+ %92 = insertelement <4 x float> undef, float %4, i32 0
+ %93 = insertelement <4 x float> %92, float %5, i32 1
+ %94 = insertelement <4 x float> %93, float %6, i32 2
+ %95 = insertelement <4 x float> %94, float 0.000000e+00, i32 3
+ %96 = call float @llvm.AMDGPU.dp4(<4 x float> %91, <4 x float> %95)
+ %97 = call float @fabs(float %96)
+ %98 = call float @llvm.AMDGPU.rsq.f32(float %97)
+ %99 = fmul float %4, %98
+ %100 = fmul float %5, %98
+ %101 = fmul float %6, %98
+ %102 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
+ %103 = extractelement <4 x float> %102, i32 0
+ %104 = fmul float %103, %8
+ %105 = fadd float %104, %20
+ %106 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
+ %107 = extractelement <4 x float> %106, i32 1
+ %108 = fmul float %107, %9
+ %109 = fadd float %108, %21
+ %110 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
+ %111 = extractelement <4 x float> %110, i32 2
+ %112 = fmul float %111, %10
+ %113 = fadd float %112, %22
+ %114 = call float @llvm.AMDIL.clamp.(float %105, float 0.000000e+00, float 1.000000e+00)
+ %115 = call float @llvm.AMDIL.clamp.(float %109, float 0.000000e+00, float 1.000000e+00)
+ %116 = call float @llvm.AMDIL.clamp.(float %113, float 0.000000e+00, float 1.000000e+00)
+ %117 = call float @llvm.AMDIL.clamp.(float %15, float 0.000000e+00, float 1.000000e+00)
+ %118 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
+ %119 = extractelement <4 x float> %118, i32 0
+ %120 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
+ %121 = extractelement <4 x float> %120, i32 1
+ %122 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
+ %123 = extractelement <4 x float> %122, i32 2
+ %124 = insertelement <4 x float> undef, float %99, i32 0
+ %125 = insertelement <4 x float> %124, float %100, i32 1
+ %126 = insertelement <4 x float> %125, float %101, i32 2
+ %127 = insertelement <4 x float> %126, float 0.000000e+00, i32 3
+ %128 = insertelement <4 x float> undef, float %119, i32 0
+ %129 = insertelement <4 x float> %128, float %121, i32 1
+ %130 = insertelement <4 x float> %129, float %123, i32 2
+ %131 = insertelement <4 x float> %130, float 0.000000e+00, i32 3
+ %132 = call float @llvm.AMDGPU.dp4(<4 x float> %127, <4 x float> %131)
+ %133 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
+ %134 = extractelement <4 x float> %133, i32 0
+ %135 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
+ %136 = extractelement <4 x float> %135, i32 1
+ %137 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
+ %138 = extractelement <4 x float> %137, i32 2
+ %139 = insertelement <4 x float> undef, float %99, i32 0
+ %140 = insertelement <4 x float> %139, float %100, i32 1
+ %141 = insertelement <4 x float> %140, float %101, i32 2
+ %142 = insertelement <4 x float> %141, float 0.000000e+00, i32 3
+ %143 = insertelement <4 x float> undef, float %134, i32 0
+ %144 = insertelement <4 x float> %143, float %136, i32 1
+ %145 = insertelement <4 x float> %144, float %138, i32 2
+ %146 = insertelement <4 x float> %145, float 0.000000e+00, i32 3
+ %147 = call float @llvm.AMDGPU.dp4(<4 x float> %142, <4 x float> %146)
+ %148 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8)
+ %149 = extractelement <4 x float> %148, i32 0
+ %150 = fmul float %149, %8
+ %151 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8)
+ %152 = extractelement <4 x float> %151, i32 1
+ %153 = fmul float %152, %9
+ %154 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8)
+ %155 = extractelement <4 x float> %154, i32 2
+ %156 = fmul float %155, %10
+ %157 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
+ %158 = extractelement <4 x float> %157, i32 0
+ %159 = fmul float %158, %12
+ %160 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
+ %161 = extractelement <4 x float> %160, i32 1
+ %162 = fmul float %161, %13
+ %163 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
+ %164 = extractelement <4 x float> %163, i32 2
+ %165 = fmul float %164, %14
+ %166 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10)
+ %167 = extractelement <4 x float> %166, i32 0
+ %168 = fmul float %167, %16
+ %169 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10)
+ %170 = extractelement <4 x float> %169, i32 1
+ %171 = fmul float %170, %17
+ %172 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10)
+ %173 = extractelement <4 x float> %172, i32 2
+ %174 = fmul float %173, %18
+ %175 = fcmp uge float %132, 0.000000e+00
+ %176 = select i1 %175, float %132, float 0.000000e+00
+ %177 = fcmp uge float %147, 0.000000e+00
+ %178 = select i1 %177, float %147, float 0.000000e+00
+ %179 = call float @llvm.pow.f32(float %178, float %24)
+ %180 = fcmp ult float %132, 0.000000e+00
+ %181 = select i1 %180, float 0.000000e+00, float %179
+ %182 = fadd float %150, %105
+ %183 = fadd float %153, %109
+ %184 = fadd float %156, %113
+ %185 = fmul float %176, %159
+ %186 = fadd float %185, %182
+ %187 = fmul float %176, %162
+ %188 = fadd float %187, %183
+ %189 = fmul float %176, %165
+ %190 = fadd float %189, %184
+ %191 = fmul float %181, %168
+ %192 = fadd float %191, %186
+ %193 = fmul float %181, %171
+ %194 = fadd float %193, %188
+ %195 = fmul float %181, %174
+ %196 = fadd float %195, %190
+ %197 = call float @llvm.AMDIL.clamp.(float %192, float 0.000000e+00, float 1.000000e+00)
+ %198 = call float @llvm.AMDIL.clamp.(float %194, float 0.000000e+00, float 1.000000e+00)
+ %199 = call float @llvm.AMDIL.clamp.(float %196, float 0.000000e+00, float 1.000000e+00)
+ %200 = insertelement <4 x float> undef, float %75, i32 0
+ %201 = insertelement <4 x float> %200, float %79, i32 1
+ %202 = insertelement <4 x float> %201, float %83, i32 2
+ %203 = insertelement <4 x float> %202, float %87, i32 3
+ call void @llvm.R600.store.swizzle(<4 x float> %203, i32 60, i32 1)
+ %204 = insertelement <4 x float> undef, float %197, i32 0
+ %205 = insertelement <4 x float> %204, float %198, i32 1
+ %206 = insertelement <4 x float> %205, float %199, i32 2
+ %207 = insertelement <4 x float> %206, float %117, i32 3
+ call void @llvm.R600.store.swizzle(<4 x float> %207, i32 0, i32 2)
+ ret void
+}
+
+; Function Attrs: readnone
+declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1
+
+; Function Attrs: readonly
+declare float @fabs(float) #2
+
+; Function Attrs: readnone
+declare float @llvm.AMDGPU.rsq.f32(float) #1
+
+; Function Attrs: readnone
+declare float @llvm.AMDIL.clamp.(float, float, float) #1
+
+; Function Attrs: nounwind readonly
+declare float @llvm.pow.f32(float, float) #3
+
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+
+attributes #0 = { "ShaderType"="1" }
+attributes #1 = { readnone }
+attributes #2 = { readonly }
+attributes #3 = { nounwind readonly }
diff --git a/test/CodeGen/AMDGPU/r600-encoding.ll b/test/CodeGen/AMDGPU/r600-encoding.ll
new file mode 100644
index 000000000000..3a82ee30a328
--- /dev/null
+++ b/test/CodeGen/AMDGPU/r600-encoding.ll
@@ -0,0 +1,25 @@
+; RUN: llc < %s -march=r600 -show-mc-encoding -mcpu=redwood | FileCheck --check-prefix=EG %s
+; RUN: llc < %s -march=r600 -show-mc-encoding -mcpu=rs880 | FileCheck --check-prefix=R600 %s
+
+; The earliest R600 GPUs have a slightly different encoding than the rest of
+; the VLIW4/5 GPUs.
+
+; EG: {{^}}test:
+; EG: MUL_IEEE {{[ *TXYZWPVxyzw.,0-9]+}} ; encoding: [{{0x[0-9a-f]+,0x[0-9a-f]+,0x[0-9a-f]+,0x[0-9a-f]+,0x10,0x01,0x[0-9a-f]+,0x[0-9a-f]+}}]
+
+; R600: {{^}}test:
+; R600: MUL_IEEE {{[ *TXYZWPVxyzw.,0-9]+}} ; encoding: [{{0x[0-9a-f]+,0x[0-9a-f]+,0x[0-9a-f]+,0x[0-9a-f]+,0x10,0x02,0x[0-9a-f]+,0x[0-9a-f]+}}]
+
+define void @test(<4 x float> inreg %reg0) #0 {
+entry:
+ %r0 = extractelement <4 x float> %reg0, i32 0
+ %r1 = extractelement <4 x float> %reg0, i32 1
+ %r2 = fmul float %r0, %r1
+ %vec = insertelement <4 x float> undef, float %r2, i32 0
+ call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
+ ret void
+}
+
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+
+attributes #0 = { "ShaderType"="0" }
diff --git a/test/CodeGen/AMDGPU/r600-export-fix.ll b/test/CodeGen/AMDGPU/r600-export-fix.ll
new file mode 100644
index 000000000000..7cb80195b368
--- /dev/null
+++ b/test/CodeGen/AMDGPU/r600-export-fix.ll
@@ -0,0 +1,142 @@
+; RUN: llc < %s -march=r600 -mcpu=cedar | FileCheck %s
+
+;CHECK: EXPORT T{{[0-9]}}.XYZW
+;CHECK: EXPORT T{{[0-9]}}.0000
+;CHECK: EXPORT T{{[0-9]}}.0000
+;CHECK: EXPORT T{{[0-9]}}.0XYZ
+;CHECK: EXPORT T{{[0-9]}}.XYZW
+;CHECK: EXPORT T{{[0-9]}}.YZ00
+;CHECK: EXPORT T{{[0-9]}}.0000
+;CHECK: EXPORT T{{[0-9]}}.0000
+
+
+define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 {
+main_body:
+ %0 = extractelement <4 x float> %reg1, i32 0
+ %1 = extractelement <4 x float> %reg1, i32 1
+ %2 = extractelement <4 x float> %reg1, i32 2
+ %3 = extractelement <4 x float> %reg1, i32 3
+ %4 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
+ %5 = extractelement <4 x float> %4, i32 0
+ %6 = fmul float %5, %0
+ %7 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
+ %8 = extractelement <4 x float> %7, i32 1
+ %9 = fmul float %8, %0
+ %10 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
+ %11 = extractelement <4 x float> %10, i32 2
+ %12 = fmul float %11, %0
+ %13 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
+ %14 = extractelement <4 x float> %13, i32 3
+ %15 = fmul float %14, %0
+ %16 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
+ %17 = extractelement <4 x float> %16, i32 0
+ %18 = fmul float %17, %1
+ %19 = fadd float %18, %6
+ %20 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
+ %21 = extractelement <4 x float> %20, i32 1
+ %22 = fmul float %21, %1
+ %23 = fadd float %22, %9
+ %24 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
+ %25 = extractelement <4 x float> %24, i32 2
+ %26 = fmul float %25, %1
+ %27 = fadd float %26, %12
+ %28 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
+ %29 = extractelement <4 x float> %28, i32 3
+ %30 = fmul float %29, %1
+ %31 = fadd float %30, %15
+ %32 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6)
+ %33 = extractelement <4 x float> %32, i32 0
+ %34 = fmul float %33, %2
+ %35 = fadd float %34, %19
+ %36 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6)
+ %37 = extractelement <4 x float> %36, i32 1
+ %38 = fmul float %37, %2
+ %39 = fadd float %38, %23
+ %40 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6)
+ %41 = extractelement <4 x float> %40, i32 2
+ %42 = fmul float %41, %2
+ %43 = fadd float %42, %27
+ %44 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6)
+ %45 = extractelement <4 x float> %44, i32 3
+ %46 = fmul float %45, %2
+ %47 = fadd float %46, %31
+ %48 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
+ %49 = extractelement <4 x float> %48, i32 0
+ %50 = fmul float %49, %3
+ %51 = fadd float %50, %35
+ %52 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
+ %53 = extractelement <4 x float> %52, i32 1
+ %54 = fmul float %53, %3
+ %55 = fadd float %54, %39
+ %56 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
+ %57 = extractelement <4 x float> %56, i32 2
+ %58 = fmul float %57, %3
+ %59 = fadd float %58, %43
+ %60 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
+ %61 = extractelement <4 x float> %60, i32 3
+ %62 = fmul float %61, %3
+ %63 = fadd float %62, %47
+ %64 = load <4 x float>, <4 x float> addrspace(8)* null
+ %65 = extractelement <4 x float> %64, i32 0
+ %66 = load <4 x float>, <4 x float> addrspace(8)* null
+ %67 = extractelement <4 x float> %66, i32 1
+ %68 = load <4 x float>, <4 x float> addrspace(8)* null
+ %69 = extractelement <4 x float> %68, i32 2
+ %70 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+ %71 = extractelement <4 x float> %70, i32 0
+ %72 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+ %73 = extractelement <4 x float> %72, i32 1
+ %74 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+ %75 = extractelement <4 x float> %74, i32 2
+ %76 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
+ %77 = extractelement <4 x float> %76, i32 0
+ %78 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
+ %79 = extractelement <4 x float> %78, i32 1
+ %80 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
+ %81 = extractelement <4 x float> %80, i32 2
+ %82 = insertelement <4 x float> undef, float %51, i32 0
+ %83 = insertelement <4 x float> %82, float %55, i32 1
+ %84 = insertelement <4 x float> %83, float %59, i32 2
+ %85 = insertelement <4 x float> %84, float %63, i32 3
+ call void @llvm.R600.store.swizzle(<4 x float> %85, i32 60, i32 1)
+ %86 = insertelement <4 x float> undef, float 0.000000e+00, i32 0
+ %87 = insertelement <4 x float> %86, float 0.000000e+00, i32 1
+ %88 = insertelement <4 x float> %87, float 0.000000e+00, i32 2
+ %89 = insertelement <4 x float> %88, float 0.000000e+00, i32 3
+ call void @llvm.R600.store.swizzle(<4 x float> %89, i32 0, i32 2)
+ %90 = insertelement <4 x float> undef, float 0.000000e+00, i32 0
+ %91 = insertelement <4 x float> %90, float 0.000000e+00, i32 1
+ %92 = insertelement <4 x float> %91, float 0.000000e+00, i32 2
+ %93 = insertelement <4 x float> %92, float 0.000000e+00, i32 3
+ call void @llvm.R600.store.swizzle(<4 x float> %93, i32 1, i32 2)
+ %94 = insertelement <4 x float> undef, float 0.000000e+00, i32 0
+ %95 = insertelement <4 x float> %94, float %65, i32 1
+ %96 = insertelement <4 x float> %95, float %67, i32 2
+ %97 = insertelement <4 x float> %96, float %69, i32 3
+ call void @llvm.R600.store.swizzle(<4 x float> %97, i32 2, i32 2)
+ %98 = insertelement <4 x float> undef, float %77, i32 0
+ %99 = insertelement <4 x float> %98, float %79, i32 1
+ %100 = insertelement <4 x float> %99, float %81, i32 2
+ %101 = insertelement <4 x float> %100, float %71, i32 3
+ call void @llvm.R600.store.swizzle(<4 x float> %101, i32 3, i32 2)
+ %102 = insertelement <4 x float> undef, float %73, i32 0
+ %103 = insertelement <4 x float> %102, float %75, i32 1
+ %104 = insertelement <4 x float> %103, float 0.000000e+00, i32 2
+ %105 = insertelement <4 x float> %104, float 0.000000e+00, i32 3
+ call void @llvm.R600.store.swizzle(<4 x float> %105, i32 4, i32 2)
+ %106 = insertelement <4 x float> undef, float 0.000000e+00, i32 0
+ %107 = insertelement <4 x float> %106, float 0.000000e+00, i32 1
+ %108 = insertelement <4 x float> %107, float 0.000000e+00, i32 2
+ %109 = insertelement <4 x float> %108, float 0.000000e+00, i32 3
+ call void @llvm.R600.store.swizzle(<4 x float> %109, i32 5, i32 2)
+ %110 = insertelement <4 x float> undef, float 0.000000e+00, i32 0
+ %111 = insertelement <4 x float> %110, float 0.000000e+00, i32 1
+ %112 = insertelement <4 x float> %111, float 0.000000e+00, i32 2
+ %113 = insertelement <4 x float> %112, float 0.000000e+00, i32 3
+ call void @llvm.R600.store.swizzle(<4 x float> %113, i32 6, i32 2)
+ ret void
+}
+
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+
+attributes #0 = { "ShaderType"="1" }
diff --git a/test/CodeGen/AMDGPU/r600-infinite-loop-bug-while-reorganizing-vector.ll b/test/CodeGen/AMDGPU/r600-infinite-loop-bug-while-reorganizing-vector.ll
new file mode 100644
index 000000000000..f388f8ffe293
--- /dev/null
+++ b/test/CodeGen/AMDGPU/r600-infinite-loop-bug-while-reorganizing-vector.ll
@@ -0,0 +1,58 @@
+;RUN: llc < %s -march=r600 -mcpu=cayman
+
+define void @main(<4 x float> inreg, <4 x float> inreg) #0 {
+main_body:
+ %2 = extractelement <4 x float> %0, i32 0
+ %3 = extractelement <4 x float> %0, i32 1
+ %4 = extractelement <4 x float> %0, i32 2
+ %5 = extractelement <4 x float> %0, i32 3
+ %6 = insertelement <4 x float> undef, float %2, i32 0
+ %7 = insertelement <4 x float> %6, float %3, i32 1
+ %8 = insertelement <4 x float> %7, float %4, i32 2
+ %9 = insertelement <4 x float> %8, float %5, i32 3
+ %10 = call <4 x float> @llvm.AMDGPU.cube(<4 x float> %9)
+ %11 = extractelement <4 x float> %10, i32 0
+ %12 = extractelement <4 x float> %10, i32 1
+ %13 = extractelement <4 x float> %10, i32 2
+ %14 = extractelement <4 x float> %10, i32 3
+ %15 = call float @fabs(float %13)
+ %16 = fdiv float 1.000000e+00, %15
+ %17 = fmul float %11, %16
+ %18 = fadd float %17, 1.500000e+00
+ %19 = fmul float %12, %16
+ %20 = fadd float %19, 1.500000e+00
+ %21 = insertelement <4 x float> undef, float %20, i32 0
+ %22 = insertelement <4 x float> %21, float %18, i32 1
+ %23 = insertelement <4 x float> %22, float %14, i32 2
+ %24 = insertelement <4 x float> %23, float %5, i32 3
+ %25 = extractelement <4 x float> %24, i32 0
+ %26 = extractelement <4 x float> %24, i32 1
+ %27 = extractelement <4 x float> %24, i32 2
+ %28 = extractelement <4 x float> %24, i32 3
+ %29 = insertelement <4 x float> undef, float %25, i32 0
+ %30 = insertelement <4 x float> %29, float %26, i32 1
+ %31 = insertelement <4 x float> %30, float %27, i32 2
+ %32 = insertelement <4 x float> %31, float %28, i32 3
+ %33 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %32, i32 16, i32 0, i32 13)
+ %34 = extractelement <4 x float> %33, i32 0
+ %35 = insertelement <4 x float> undef, float %34, i32 0
+ %36 = insertelement <4 x float> %35, float %34, i32 1
+ %37 = insertelement <4 x float> %36, float %34, i32 2
+ %38 = insertelement <4 x float> %37, float 1.000000e+00, i32 3
+ call void @llvm.R600.store.swizzle(<4 x float> %38, i32 0, i32 0)
+ ret void
+}
+
+; Function Attrs: readnone
+declare <4 x float> @llvm.AMDGPU.cube(<4 x float>) #1
+
+; Function Attrs: readnone
+declare float @fabs(float) #1
+
+; Function Attrs: readnone
+declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) #1
+
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+
+attributes #0 = { "ShaderType"="0" }
+attributes #1 = { readnone }
diff --git a/test/CodeGen/AMDGPU/r600cfg.ll b/test/CodeGen/AMDGPU/r600cfg.ll
new file mode 100644
index 000000000000..c7b9d65220f3
--- /dev/null
+++ b/test/CodeGen/AMDGPU/r600cfg.ll
@@ -0,0 +1,119 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood
+
+define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 {
+main_body:
+ %0 = extractelement <4 x float> %reg1, i32 0
+ %1 = extractelement <4 x float> %reg1, i32 1
+ %2 = extractelement <4 x float> %reg1, i32 2
+ %3 = extractelement <4 x float> %reg1, i32 3
+ %4 = bitcast float %0 to i32
+ %5 = icmp eq i32 %4, 0
+ %6 = sext i1 %5 to i32
+ %7 = bitcast i32 %6 to float
+ %8 = bitcast float %7 to i32
+ %9 = icmp ne i32 %8, 0
+ %. = select i1 %9, float 0x36A0000000000000, float %0
+ br label %LOOP
+
+LOOP: ; preds = %LOOP47, %main_body
+ %temp12.0 = phi float [ 0x36A0000000000000, %main_body ], [ %temp12.1, %LOOP47 ]
+ %temp8.0 = phi float [ 0.000000e+00, %main_body ], [ %38, %LOOP47 ]
+ %temp4.1 = phi float [ %., %main_body ], [ %52, %LOOP47 ]
+ %10 = bitcast float %temp4.1 to i32
+ %11 = icmp eq i32 %10, 1
+ %12 = sext i1 %11 to i32
+ %13 = bitcast i32 %12 to float
+ %14 = bitcast float %13 to i32
+ %15 = icmp ne i32 %14, 0
+ br i1 %15, label %IF41, label %ENDIF40
+
+IF41: ; preds = %LOOP
+ %16 = insertelement <4 x float> undef, float %0, i32 0
+ %17 = insertelement <4 x float> %16, float %temp8.0, i32 1
+ %18 = insertelement <4 x float> %17, float %temp12.0, i32 2
+ %19 = insertelement <4 x float> %18, float 0.000000e+00, i32 3
+ call void @llvm.R600.store.stream.output(<4 x float> %19, i32 0, i32 0, i32 1)
+ %20 = insertelement <4 x float> undef, float %0, i32 0
+ %21 = insertelement <4 x float> %20, float %temp8.0, i32 1
+ %22 = insertelement <4 x float> %21, float %temp12.0, i32 2
+ %23 = insertelement <4 x float> %22, float 0.000000e+00, i32 3
+ call void @llvm.R600.store.stream.output(<4 x float> %23, i32 0, i32 0, i32 2)
+ %24 = insertelement <4 x float> undef, float %0, i32 0
+ %25 = insertelement <4 x float> %24, float %temp8.0, i32 1
+ %26 = insertelement <4 x float> %25, float %temp12.0, i32 2
+ %27 = insertelement <4 x float> %26, float 0.000000e+00, i32 3
+ call void @llvm.R600.store.stream.output(<4 x float> %27, i32 0, i32 0, i32 4)
+ %28 = insertelement <4 x float> undef, float 0.000000e+00, i32 0
+ %29 = insertelement <4 x float> %28, float 0.000000e+00, i32 1
+ %30 = insertelement <4 x float> %29, float 0.000000e+00, i32 2
+ %31 = insertelement <4 x float> %30, float 0.000000e+00, i32 3
+ call void @llvm.R600.store.swizzle(<4 x float> %31, i32 60, i32 1)
+ %32 = insertelement <4 x float> undef, float %0, i32 0
+ %33 = insertelement <4 x float> %32, float %temp8.0, i32 1
+ %34 = insertelement <4 x float> %33, float %temp12.0, i32 2
+ %35 = insertelement <4 x float> %34, float 0.000000e+00, i32 3
+ call void @llvm.R600.store.swizzle(<4 x float> %35, i32 0, i32 2)
+ ret void
+
+ENDIF40: ; preds = %LOOP
+ %36 = bitcast float %temp8.0 to i32
+ %37 = add i32 %36, 1
+ %38 = bitcast i32 %37 to float
+ %39 = bitcast float %temp4.1 to i32
+ %40 = urem i32 %39, 2
+ %41 = bitcast i32 %40 to float
+ %42 = bitcast float %41 to i32
+ %43 = icmp eq i32 %42, 0
+ %44 = sext i1 %43 to i32
+ %45 = bitcast i32 %44 to float
+ %46 = bitcast float %45 to i32
+ %47 = icmp ne i32 %46, 0
+ %48 = bitcast float %temp4.1 to i32
+ br i1 %47, label %IF44, label %ELSE45
+
+IF44: ; preds = %ENDIF40
+ %49 = udiv i32 %48, 2
+ br label %ENDIF43
+
+ELSE45: ; preds = %ENDIF40
+ %50 = mul i32 3, %48
+ %51 = add i32 %50, 1
+ br label %ENDIF43
+
+ENDIF43: ; preds = %ELSE45, %IF44
+ %.sink = phi i32 [ %49, %IF44 ], [ %51, %ELSE45 ]
+ %52 = bitcast i32 %.sink to float
+ %53 = load <4 x float>, <4 x float> addrspace(8)* null
+ %54 = extractelement <4 x float> %53, i32 0
+ %55 = bitcast float %54 to i32
+ br label %LOOP47
+
+LOOP47: ; preds = %ENDIF48, %ENDIF43
+ %temp12.1 = phi float [ %temp12.0, %ENDIF43 ], [ %67, %ENDIF48 ]
+ %temp28.0 = phi float [ 0.000000e+00, %ENDIF43 ], [ %70, %ENDIF48 ]
+ %56 = bitcast float %temp28.0 to i32
+ %57 = icmp uge i32 %56, %55
+ %58 = sext i1 %57 to i32
+ %59 = bitcast i32 %58 to float
+ %60 = bitcast float %59 to i32
+ %61 = icmp ne i32 %60, 0
+ br i1 %61, label %LOOP, label %ENDIF48
+
+ENDIF48: ; preds = %LOOP47
+ %62 = bitcast float %temp12.1 to i32
+ %63 = mul i32 %62, 2
+ %64 = bitcast i32 %63 to float
+ %65 = bitcast float %64 to i32
+ %66 = urem i32 %65, 2147483647
+ %67 = bitcast i32 %66 to float
+ %68 = bitcast float %temp28.0 to i32
+ %69 = add i32 %68, 1
+ %70 = bitcast i32 %69 to float
+ br label %LOOP47
+}
+
+declare void @llvm.R600.store.stream.output(<4 x float>, i32, i32, i32)
+
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+
+attributes #0 = { "ShaderType"="1" }
diff --git a/test/CodeGen/AMDGPU/reciprocal.ll b/test/CodeGen/AMDGPU/reciprocal.ll
new file mode 100644
index 000000000000..b4ac47afced7
--- /dev/null
+++ b/test/CodeGen/AMDGPU/reciprocal.ll
@@ -0,0 +1,15 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+;CHECK: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @test(<4 x float> inreg %reg0) #0 {
+ %r0 = extractelement <4 x float> %reg0, i32 0
+ %r1 = fdiv float 1.0, %r0
+ %vec = insertelement <4 x float> undef, float %r1, i32 0
+ call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
+ ret void
+}
+
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+
+attributes #0 = { "ShaderType"="0" }
diff --git a/test/CodeGen/AMDGPU/register-count-comments.ll b/test/CodeGen/AMDGPU/register-count-comments.ll
new file mode 100644
index 000000000000..de6bfb310883
--- /dev/null
+++ b/test/CodeGen/AMDGPU/register-count-comments.ll
@@ -0,0 +1,27 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -asm-verbose < %s | FileCheck -check-prefix=SI %s
+
+declare i32 @llvm.SI.tid() nounwind readnone
+
+; SI-LABEL: {{^}}foo:
+; SI: .section .AMDGPU.csdata
+; SI: ; Kernel info:
+; SI: ; NumSgprs: {{[0-9]+}}
+; SI: ; NumVgprs: {{[0-9]+}}
+define void @foo(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %abase, i32 addrspace(1)* %bbase) nounwind {
+ %tid = call i32 @llvm.SI.tid() nounwind readnone
+ %aptr = getelementptr i32, i32 addrspace(1)* %abase, i32 %tid
+ %bptr = getelementptr i32, i32 addrspace(1)* %bbase, i32 %tid
+ %outptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %a = load i32, i32 addrspace(1)* %aptr, align 4
+ %b = load i32, i32 addrspace(1)* %bptr, align 4
+ %result = add i32 %a, %b
+ store i32 %result, i32 addrspace(1)* %outptr, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}one_vgpr_used:
+; SI: NumVgprs: 1
+define void @one_vgpr_used(i32 addrspace(1)* %out, i32 %x) nounwind {
+ store i32 %x, i32 addrspace(1)* %out, align 4
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/reorder-stores.ll b/test/CodeGen/AMDGPU/reorder-stores.ll
new file mode 100644
index 000000000000..187650ff9a53
--- /dev/null
+++ b/test/CodeGen/AMDGPU/reorder-stores.ll
@@ -0,0 +1,105 @@
+; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI %s
+
+; SI-LABEL: {{^}}no_reorder_v2f64_global_load_store:
+; SI: buffer_load_dwordx2
+; SI: buffer_load_dwordx2
+; SI: buffer_load_dwordx2
+; SI: buffer_load_dwordx2
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: s_endpgm
+define void @no_reorder_v2f64_global_load_store(<2 x double> addrspace(1)* nocapture %x, <2 x double> addrspace(1)* nocapture %y) nounwind {
+ %tmp1 = load <2 x double>, <2 x double> addrspace(1)* %x, align 16
+ %tmp4 = load <2 x double>, <2 x double> addrspace(1)* %y, align 16
+ store <2 x double> %tmp4, <2 x double> addrspace(1)* %x, align 16
+ store <2 x double> %tmp1, <2 x double> addrspace(1)* %y, align 16
+ ret void
+}
+
+; SI-LABEL: {{^}}no_reorder_scalarized_v2f64_local_load_store:
+; SI: ds_read_b64
+; SI: ds_read_b64
+; SI: ds_write_b64
+; SI: ds_write_b64
+; SI: s_endpgm
+define void @no_reorder_scalarized_v2f64_local_load_store(<2 x double> addrspace(3)* nocapture %x, <2 x double> addrspace(3)* nocapture %y) nounwind {
+ %tmp1 = load <2 x double>, <2 x double> addrspace(3)* %x, align 16
+ %tmp4 = load <2 x double>, <2 x double> addrspace(3)* %y, align 16
+ store <2 x double> %tmp4, <2 x double> addrspace(3)* %x, align 16
+ store <2 x double> %tmp1, <2 x double> addrspace(3)* %y, align 16
+ ret void
+}
+
+; SI-LABEL: {{^}}no_reorder_split_v8i32_global_load_store:
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+
+
+; SI: buffer_store_dword
+; SI: buffer_store_dword
+; SI: buffer_store_dword
+; SI: buffer_store_dword
+
+; SI: buffer_store_dword
+; SI: buffer_store_dword
+; SI: buffer_store_dword
+; SI: buffer_store_dword
+
+; SI: buffer_store_dword
+; SI: buffer_store_dword
+; SI: buffer_store_dword
+; SI: buffer_store_dword
+
+; SI: buffer_store_dword
+; SI: buffer_store_dword
+; SI: buffer_store_dword
+; SI: buffer_store_dword
+; SI: s_endpgm
+define void @no_reorder_split_v8i32_global_load_store(<8 x i32> addrspace(1)* nocapture %x, <8 x i32> addrspace(1)* nocapture %y) nounwind {
+ %tmp1 = load <8 x i32>, <8 x i32> addrspace(1)* %x, align 32
+ %tmp4 = load <8 x i32>, <8 x i32> addrspace(1)* %y, align 32
+ store <8 x i32> %tmp4, <8 x i32> addrspace(1)* %x, align 32
+ store <8 x i32> %tmp1, <8 x i32> addrspace(1)* %y, align 32
+ ret void
+}
+
+; SI-LABEL: {{^}}no_reorder_extload_64:
+; SI: ds_read_b64
+; SI: ds_read_b64
+; SI: ds_write_b64
+; SI-NOT: ds_read
+; SI: ds_write_b64
+; SI: s_endpgm
+define void @no_reorder_extload_64(<2 x i32> addrspace(3)* nocapture %x, <2 x i32> addrspace(3)* nocapture %y) nounwind {
+ %tmp1 = load <2 x i32>, <2 x i32> addrspace(3)* %x, align 8
+ %tmp4 = load <2 x i32>, <2 x i32> addrspace(3)* %y, align 8
+ %tmp1ext = zext <2 x i32> %tmp1 to <2 x i64>
+ %tmp4ext = zext <2 x i32> %tmp4 to <2 x i64>
+ %tmp7 = add <2 x i64> %tmp1ext, <i64 1, i64 1>
+ %tmp9 = add <2 x i64> %tmp4ext, <i64 1, i64 1>
+ %trunctmp9 = trunc <2 x i64> %tmp9 to <2 x i32>
+ %trunctmp7 = trunc <2 x i64> %tmp7 to <2 x i32>
+ store <2 x i32> %trunctmp9, <2 x i32> addrspace(3)* %x, align 8
+ store <2 x i32> %trunctmp7, <2 x i32> addrspace(3)* %y, align 8
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/rotl.i64.ll b/test/CodeGen/AMDGPU/rotl.i64.ll
new file mode 100644
index 000000000000..3f4ceb7e0310
--- /dev/null
+++ b/test/CodeGen/AMDGPU/rotl.i64.ll
@@ -0,0 +1,39 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=BOTH %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=BOTH %s
+
+; BOTH-LABEL: {{^}}s_rotl_i64:
+; BOTH-DAG: s_lshl_b64
+; BOTH-DAG: s_sub_i32
+; BOTH-DAG: s_lshr_b64
+; BOTH: s_or_b64
+; BOTH: s_endpgm
+define void @s_rotl_i64(i64 addrspace(1)* %in, i64 %x, i64 %y) {
+entry:
+ %0 = shl i64 %x, %y
+ %1 = sub i64 64, %y
+ %2 = lshr i64 %x, %1
+ %3 = or i64 %0, %2
+ store i64 %3, i64 addrspace(1)* %in
+ ret void
+}
+
+; BOTH-LABEL: {{^}}v_rotl_i64:
+; SI-DAG: v_lshl_b64
+; VI-DAG: v_lshlrev_b64
+; BOTH-DAG: v_sub_i32
+; SI: v_lshr_b64
+; VI: v_lshrrev_b64
+; BOTH: v_or_b32
+; BOTH: v_or_b32
+; BOTH: s_endpgm
+define void @v_rotl_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %xptr, i64 addrspace(1)* %yptr) {
+entry:
+ %x = load i64, i64 addrspace(1)* %xptr, align 8
+ %y = load i64, i64 addrspace(1)* %yptr, align 8
+ %tmp0 = shl i64 %x, %y
+ %tmp1 = sub i64 64, %y
+ %tmp2 = lshr i64 %x, %tmp1
+ %tmp3 = or i64 %tmp0, %tmp2
+ store i64 %tmp3, i64 addrspace(1)* %in, align 8
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/rotl.ll b/test/CodeGen/AMDGPU/rotl.ll
new file mode 100644
index 000000000000..6c144cd56ea7
--- /dev/null
+++ b/test/CodeGen/AMDGPU/rotl.ll
@@ -0,0 +1,57 @@
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=R600 -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}rotl_i32:
+; R600: SUB_INT {{\** T[0-9]+\.[XYZW]}}, literal.x
+; R600-NEXT: 32
+; R600: BIT_ALIGN_INT {{T[0-9]+\.[XYZW]}}, KC0[2].Z, KC0[2].Z, PV.{{[XYZW]}}
+
+; SI: s_sub_i32 [[SDST:s[0-9]+]], 32, {{[s][0-9]+}}
+; SI: v_mov_b32_e32 [[VDST:v[0-9]+]], [[SDST]]
+; SI: v_alignbit_b32 {{v[0-9]+, [s][0-9]+, s[0-9]+}}, [[VDST]]
+define void @rotl_i32(i32 addrspace(1)* %in, i32 %x, i32 %y) {
+entry:
+ %0 = shl i32 %x, %y
+ %1 = sub i32 32, %y
+ %2 = lshr i32 %x, %1
+ %3 = or i32 %0, %2
+ store i32 %3, i32 addrspace(1)* %in
+ ret void
+}
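+
+; A brief sketch of the lowering the checks above expect (informal, for
+; reference): v_alignbit_b32 / BIT_ALIGN_INT returns the low 32 bits of
+; (hi:lo) >> shift, so feeding x into both halves gives a rotate right,
+; and a rotate left by y is expressed as alignbit(x, x, 32 - y). That is
+; why a SUB_INT / s_sub_i32 against 32 is computed before the alignbit.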
+
+; FUNC-LABEL: {{^}}rotl_v2i32:
+; SI-DAG: s_sub_i32
+; SI-DAG: s_sub_i32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_alignbit_b32
+; SI: s_endpgm
+define void @rotl_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y) {
+entry:
+ %0 = shl <2 x i32> %x, %y
+ %1 = sub <2 x i32> <i32 32, i32 32>, %y
+ %2 = lshr <2 x i32> %x, %1
+ %3 = or <2 x i32> %0, %2
+ store <2 x i32> %3, <2 x i32> addrspace(1)* %in
+ ret void
+}
+
+; FUNC-LABEL: {{^}}rotl_v4i32:
+; SI-DAG: s_sub_i32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: s_sub_i32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: s_sub_i32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: s_sub_i32
+; SI-DAG: v_alignbit_b32
+; SI: s_endpgm
+define void @rotl_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y) {
+entry:
+ %0 = shl <4 x i32> %x, %y
+ %1 = sub <4 x i32> <i32 32, i32 32, i32 32, i32 32>, %y
+ %2 = lshr <4 x i32> %x, %1
+ %3 = or <4 x i32> %0, %2
+ store <4 x i32> %3, <4 x i32> addrspace(1)* %in
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/rotr.i64.ll b/test/CodeGen/AMDGPU/rotr.i64.ll
new file mode 100644
index 000000000000..586de44a566c
--- /dev/null
+++ b/test/CodeGen/AMDGPU/rotr.i64.ll
@@ -0,0 +1,61 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=BOTH %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=BOTH %s
+
+; BOTH-LABEL: {{^}}s_rotr_i64:
+; BOTH-DAG: s_sub_i32
+; BOTH-DAG: s_lshr_b64
+; BOTH-DAG: s_lshl_b64
+; BOTH: s_or_b64
+define void @s_rotr_i64(i64 addrspace(1)* %in, i64 %x, i64 %y) {
+entry:
+ %tmp0 = sub i64 64, %y
+ %tmp1 = shl i64 %x, %tmp0
+ %tmp2 = lshr i64 %x, %y
+ %tmp3 = or i64 %tmp1, %tmp2
+ store i64 %tmp3, i64 addrspace(1)* %in
+ ret void
+}
+
+; BOTH-LABEL: {{^}}v_rotr_i64:
+; BOTH-DAG: v_sub_i32
+; SI-DAG: v_lshr_b64
+; SI-DAG: v_lshl_b64
+; VI-DAG: v_lshrrev_b64
+; VI-DAG: v_lshlrev_b64
+; BOTH: v_or_b32
+; BOTH: v_or_b32
+define void @v_rotr_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %xptr, i64 addrspace(1)* %yptr) {
+entry:
+ %x = load i64, i64 addrspace(1)* %xptr, align 8
+ %y = load i64, i64 addrspace(1)* %yptr, align 8
+ %tmp0 = sub i64 64, %y
+ %tmp1 = shl i64 %x, %tmp0
+ %tmp2 = lshr i64 %x, %y
+ %tmp3 = or i64 %tmp1, %tmp2
+ store i64 %tmp3, i64 addrspace(1)* %in
+ ret void
+}
+
+; BOTH-LABEL: {{^}}s_rotr_v2i64:
+define void @s_rotr_v2i64(<2 x i64> addrspace(1)* %in, <2 x i64> %x, <2 x i64> %y) {
+entry:
+ %tmp0 = sub <2 x i64> <i64 64, i64 64>, %y
+ %tmp1 = shl <2 x i64> %x, %tmp0
+ %tmp2 = lshr <2 x i64> %x, %y
+ %tmp3 = or <2 x i64> %tmp1, %tmp2
+ store <2 x i64> %tmp3, <2 x i64> addrspace(1)* %in
+ ret void
+}
+
+; BOTH-LABEL: {{^}}v_rotr_v2i64:
+define void @v_rotr_v2i64(<2 x i64> addrspace(1)* %in, <2 x i64> addrspace(1)* %xptr, <2 x i64> addrspace(1)* %yptr) {
+entry:
+ %x = load <2 x i64>, <2 x i64> addrspace(1)* %xptr, align 8
+ %y = load <2 x i64>, <2 x i64> addrspace(1)* %yptr, align 8
+ %tmp0 = sub <2 x i64> <i64 64, i64 64>, %y
+ %tmp1 = shl <2 x i64> %x, %tmp0
+ %tmp2 = lshr <2 x i64> %x, %y
+ %tmp3 = or <2 x i64> %tmp1, %tmp2
+ store <2 x i64> %tmp3, <2 x i64> addrspace(1)* %in
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/rotr.ll b/test/CodeGen/AMDGPU/rotr.ll
new file mode 100644
index 000000000000..044f9ffe6d63
--- /dev/null
+++ b/test/CodeGen/AMDGPU/rotr.ll
@@ -0,0 +1,53 @@
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=R600 -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}rotr_i32:
+; R600: BIT_ALIGN_INT
+
+; SI: v_alignbit_b32
+define void @rotr_i32(i32 addrspace(1)* %in, i32 %x, i32 %y) {
+entry:
+ %tmp0 = sub i32 32, %y
+ %tmp1 = shl i32 %x, %tmp0
+ %tmp2 = lshr i32 %x, %y
+ %tmp3 = or i32 %tmp1, %tmp2
+ store i32 %tmp3, i32 addrspace(1)* %in
+ ret void
+}
+
+; FUNC-LABEL: {{^}}rotr_v2i32:
+; R600: BIT_ALIGN_INT
+; R600: BIT_ALIGN_INT
+
+; SI: v_alignbit_b32
+; SI: v_alignbit_b32
+define void @rotr_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y) {
+entry:
+ %tmp0 = sub <2 x i32> <i32 32, i32 32>, %y
+ %tmp1 = shl <2 x i32> %x, %tmp0
+ %tmp2 = lshr <2 x i32> %x, %y
+ %tmp3 = or <2 x i32> %tmp1, %tmp2
+ store <2 x i32> %tmp3, <2 x i32> addrspace(1)* %in
+ ret void
+}
+
+; FUNC-LABEL: {{^}}rotr_v4i32:
+; R600: BIT_ALIGN_INT
+; R600: BIT_ALIGN_INT
+; R600: BIT_ALIGN_INT
+; R600: BIT_ALIGN_INT
+
+; SI: v_alignbit_b32
+; SI: v_alignbit_b32
+; SI: v_alignbit_b32
+; SI: v_alignbit_b32
+define void @rotr_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y) {
+entry:
+ %tmp0 = sub <4 x i32> <i32 32, i32 32, i32 32, i32 32>, %y
+ %tmp1 = shl <4 x i32> %x, %tmp0
+ %tmp2 = lshr <4 x i32> %x, %y
+ %tmp3 = or <4 x i32> %tmp1, %tmp2
+ store <4 x i32> %tmp3, <4 x i32> addrspace(1)* %in
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/rsq.ll b/test/CodeGen/AMDGPU/rsq.ll
new file mode 100644
index 000000000000..b67b800c7374
--- /dev/null
+++ b/test/CodeGen/AMDGPU/rsq.ll
@@ -0,0 +1,74 @@
+; RUN: llc -march=amdgcn -mcpu=SI -mattr=-fp32-denormals -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI-UNSAFE -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=SI -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI %s
+
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+declare float @llvm.sqrt.f32(float) nounwind readnone
+declare double @llvm.sqrt.f64(double) nounwind readnone
+
+; SI-LABEL: {{^}}rsq_f32:
+; SI: v_rsq_f32_e32
+; SI: s_endpgm
+define void @rsq_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
+ %val = load float, float addrspace(1)* %in, align 4
+ %sqrt = call float @llvm.sqrt.f32(float %val) nounwind readnone
+ %div = fdiv float 1.0, %sqrt
+ store float %div, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}rsq_f64:
+; SI-UNSAFE: v_rsq_f64_e32
+; SI-SAFE: v_sqrt_f64_e32
+; SI: s_endpgm
+define void @rsq_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) nounwind {
+ %val = load double, double addrspace(1)* %in, align 4
+ %sqrt = call double @llvm.sqrt.f64(double %val) nounwind readnone
+ %div = fdiv double 1.0, %sqrt
+ store double %div, double addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}rsq_f32_sgpr:
+; SI: v_rsq_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}
+; SI: s_endpgm
+define void @rsq_f32_sgpr(float addrspace(1)* noalias %out, float %val) nounwind {
+ %sqrt = call float @llvm.sqrt.f32(float %val) nounwind readnone
+ %div = fdiv float 1.0, %sqrt
+ store float %div, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; Recognize that this is rsqrt(a) * rcp(b) * c,
+; not 1 / ( 1 / sqrt(a)) * rcp(b) * c.
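+;
+; As an informal sketch of the algebra (not compiler output):
+;   z = c / (sqrt(a) * b)
+;     = (1 / sqrt(a)) * (1 / b) * c
+;     = rsqrt(a) * rcp(b) * c
+; which matches the SI-UNSAFE checks below: one v_rsq_f32, one v_rcp_f32
+; and two multiplies.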
+
+; SI-LABEL: @rsqrt_fmul
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; SI-DAG: buffer_load_dword [[C:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
+
+; SI-UNSAFE-DAG: v_rsq_f32_e32 [[RSQA:v[0-9]+]], [[A]]
+; SI-UNSAFE-DAG: v_rcp_f32_e32 [[RCPB:v[0-9]+]], [[B]]
+; SI-UNSAFE-DAG: v_mul_f32_e32 [[TMP:v[0-9]+]], [[RCPB]], [[RSQA]]
+; SI-UNSAFE: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]]
+; SI-UNSAFE: buffer_store_dword [[RESULT]]
+
+; SI-SAFE-NOT: v_rsq_f32
+
+; SI: s_endpgm
+define void @rsqrt_fmul(float addrspace(1)* %out, float addrspace(1)* %in) {
+ %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+ %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
+
+ %a = load float, float addrspace(1)* %gep.0
+ %b = load float, float addrspace(1)* %gep.1
+ %c = load float, float addrspace(1)* %gep.2
+
+ %x = call float @llvm.sqrt.f32(float %a)
+ %y = fmul float %x, %b
+ %z = fdiv float %c, %y
+ store float %z, float addrspace(1)* %out.gep
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/rv7x0_count3.ll b/test/CodeGen/AMDGPU/rv7x0_count3.ll
new file mode 100644
index 000000000000..c3fd923e4593
--- /dev/null
+++ b/test/CodeGen/AMDGPU/rv7x0_count3.ll
@@ -0,0 +1,41 @@
+; RUN: llc < %s -march=r600 -show-mc-encoding -mcpu=rv710 | FileCheck %s
+
+; CHECK: TEX 9 @6 ; encoding: [0x06,0x00,0x00,0x00,0x00,0x04,0x88,0x80]
+
+define void @test(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 {
+ %1 = extractelement <4 x float> %reg1, i32 0
+ %2 = extractelement <4 x float> %reg1, i32 1
+ %3 = extractelement <4 x float> %reg1, i32 2
+ %4 = extractelement <4 x float> %reg1, i32 3
+ %5 = insertelement <4 x float> undef, float %1, i32 0
+ %6 = insertelement <4 x float> %5, float %2, i32 1
+ %7 = insertelement <4 x float> %6, float %3, i32 2
+ %8 = insertelement <4 x float> %7, float %4, i32 3
+ %9 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 0, i32 0, i32 1)
+ %10 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 1, i32 0, i32 1)
+ %11 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 2, i32 0, i32 1)
+ %12 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 3, i32 0, i32 1)
+ %13 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 4, i32 0, i32 1)
+ %14 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 5, i32 0, i32 1)
+ %15 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 6, i32 0, i32 1)
+ %16 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 7, i32 0, i32 1)
+ %17 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 8, i32 0, i32 1)
+ %18 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 9, i32 0, i32 1)
+ %19 = fadd <4 x float> %9, %10
+ %20 = fadd <4 x float> %19, %11
+ %21 = fadd <4 x float> %20, %12
+ %22 = fadd <4 x float> %21, %13
+ %23 = fadd <4 x float> %22, %14
+ %24 = fadd <4 x float> %23, %15
+ %25 = fadd <4 x float> %24, %16
+ %26 = fadd <4 x float> %25, %17
+ %27 = fadd <4 x float> %26, %18
+ call void @llvm.R600.store.swizzle(<4 x float> %27, i32 0, i32 2)
+ ret void
+}
+
+declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) readnone
+
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+
+attributes #0 = { "ShaderType"="1" }
diff --git a/test/CodeGen/AMDGPU/s_movk_i32.ll b/test/CodeGen/AMDGPU/s_movk_i32.ll
new file mode 100644
index 000000000000..6b1a36c979c2
--- /dev/null
+++ b/test/CodeGen/AMDGPU/s_movk_i32.ll
@@ -0,0 +1,185 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+; SI-LABEL: {{^}}s_movk_i32_k0:
+; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0xffff{{$}}
+; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 1{{$}}
+; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}},
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
+; SI: s_endpgm
+define void @s_movk_i32_k0(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+ %loada = load i64, i64 addrspace(1)* %a, align 4
+ %or = or i64 %loada, 4295032831 ; ((1 << 16) - 1) | (1 << 32)
+ store i64 %or, i64 addrspace(1)* %out
+ ret void
+}
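+
+; A worked example of the split being exercised (a sketch, not a general
+; rule): s_movk_i32 takes a signed 16-bit immediate that is sign-extended,
+; so the low half 0xffff above cannot use it (it would become 0xffffffff)
+; and falls back to s_mov_b32, while a value such as 0x7fff in the next
+; test fits and is expected to use s_movk_i32.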
+
+; SI-LABEL: {{^}}s_movk_i32_k1:
+; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x7fff{{$}}
+; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 1{{$}}
+; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}},
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
+; SI: s_endpgm
+define void @s_movk_i32_k1(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+ %loada = load i64, i64 addrspace(1)* %a, align 4
+ %or = or i64 %loada, 4295000063 ; ((1 << 15) - 1) | (1 << 32)
+ store i64 %or, i64 addrspace(1)* %out
+ ret void
+}
+
+; SI-LABEL: {{^}}s_movk_i32_k2:
+; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x7fff{{$}}
+; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 64{{$}}
+; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}},
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
+; SI: s_endpgm
+define void @s_movk_i32_k2(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+ %loada = load i64, i64 addrspace(1)* %a, align 4
+ %or = or i64 %loada, 274877939711 ; ((1 << 15) - 1) | (64 << 32)
+ store i64 %or, i64 addrspace(1)* %out
+ ret void
+}
+
+; SI-LABEL: {{^}}s_movk_i32_k3:
+; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0x8000{{$}}
+; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 1{{$}}
+; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}},
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
+; SI: s_endpgm
+define void @s_movk_i32_k3(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+ %loada = load i64, i64 addrspace(1)* %a, align 4
+ %or = or i64 %loada, 4295000064 ; (1 << 15) | (1 << 32)
+ store i64 %or, i64 addrspace(1)* %out
+ ret void
+}
+
+; SI-LABEL: {{^}}s_movk_i32_k4:
+; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0x20000{{$}}
+; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 1{{$}}
+; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}},
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
+; SI: s_endpgm
+define void @s_movk_i32_k4(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+ %loada = load i64, i64 addrspace(1)* %a, align 4
+ %or = or i64 %loada, 4295098368 ; (1 << 17) | (1 << 32)
+ store i64 %or, i64 addrspace(1)* %out
+ ret void
+}
+
+; SI-LABEL: {{^}}s_movk_i32_k5:
+; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0xffef{{$}}
+; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 0xff00ffff{{$}}
+; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}},
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
+; SI: s_endpgm
+define void @s_movk_i32_k5(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+ %loada = load i64, i64 addrspace(1)* %a, align 4
+ %or = or i64 %loada, 18374967954648334319 ; -17 & 0xff00ffffffffffff
+ store i64 %or, i64 addrspace(1)* %out
+ ret void
+}
+
+; SI-LABEL: {{^}}s_movk_i32_k6:
+; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x41{{$}}
+; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 63{{$}}
+; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}},
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
+; SI: s_endpgm
+define void @s_movk_i32_k6(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+ %loada = load i64, i64 addrspace(1)* %a, align 4
+ %or = or i64 %loada, 270582939713 ; 65 | (63 << 32)
+ store i64 %or, i64 addrspace(1)* %out
+ ret void
+}
+
+; SI-LABEL: {{^}}s_movk_i32_k7:
+; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x2000{{$}}
+; SI-DAG: s_movk_i32 [[HI_S_IMM:s[0-9]+]], 0x4000{{$}}
+; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}},
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
+; SI: s_endpgm
+define void @s_movk_i32_k7(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+ %loada = load i64, i64 addrspace(1)* %a, align 4
+  %or = or i64 %loada, 70368744185856 ; (1 << 13) | ((1 << 14) << 32)
+ store i64 %or, i64 addrspace(1)* %out
+ ret void
+}
+
+
+; SI-LABEL: {{^}}s_movk_i32_k8:
+; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x8000{{$}}
+; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 0x11111111{{$}}
+; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}},
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
+; SI: s_endpgm
+define void @s_movk_i32_k8(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+ %loada = load i64, i64 addrspace(1)* %a, align 4
+ %or = or i64 %loada, 1229782942255906816 ; 0x11111111ffff8000
+ store i64 %or, i64 addrspace(1)* %out
+ ret void
+}
+
+; SI-LABEL: {{^}}s_movk_i32_k9:
+; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x8001{{$}}
+; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 0x11111111{{$}}
+; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}},
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
+; SI: s_endpgm
+define void @s_movk_i32_k9(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+ %loada = load i64, i64 addrspace(1)* %a, align 4
+ %or = or i64 %loada, 1229782942255906817 ; 0x11111111ffff8001
+ store i64 %or, i64 addrspace(1)* %out
+ ret void
+}
+
+; SI-LABEL: {{^}}s_movk_i32_k10:
+; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x8888{{$}}
+; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 0x11111111{{$}}
+; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}},
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
+; SI: s_endpgm
+define void @s_movk_i32_k10(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+ %loada = load i64, i64 addrspace(1)* %a, align 4
+ %or = or i64 %loada, 1229782942255909000 ; 0x11111111ffff8888
+ store i64 %or, i64 addrspace(1)* %out
+ ret void
+}
+
+; SI-LABEL: {{^}}s_movk_i32_k11:
+; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x8fff{{$}}
+; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 0x11111111{{$}}
+; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}},
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
+; SI: s_endpgm
+define void @s_movk_i32_k11(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+ %loada = load i64, i64 addrspace(1)* %a, align 4
+ %or = or i64 %loada, 1229782942255910911 ; 0x11111111ffff8fff
+ store i64 %or, i64 addrspace(1)* %out
+ ret void
+}
+
+; SI-LABEL: {{^}}s_movk_i32_k12:
+; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0xffff7001{{$}}
+; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 0x11111111{{$}}
+; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}},
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
+; SI: s_endpgm
+define void @s_movk_i32_k12(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+ %loada = load i64, i64 addrspace(1)* %a, align 4
+ %or = or i64 %loada, 1229782942255902721 ; 0x11111111ffff7001
+ store i64 %or, i64 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/saddo.ll b/test/CodeGen/AMDGPU/saddo.ll
new file mode 100644
index 000000000000..f8ced7942a60
--- /dev/null
+++ b/test/CodeGen/AMDGPU/saddo.ll
@@ -0,0 +1,63 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s
+
+declare { i32, i1 } @llvm.sadd.with.overflow.i32(i32, i32) nounwind readnone
+declare { i64, i1 } @llvm.sadd.with.overflow.i64(i64, i64) nounwind readnone
+
+; FUNC-LABEL: {{^}}saddo_i64_zext:
+define void @saddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+ %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind
+ %val = extractvalue { i64, i1 } %sadd, 0
+ %carry = extractvalue { i64, i1 } %sadd, 1
+ %ext = zext i1 %carry to i64
+ %add2 = add i64 %val, %ext
+ store i64 %add2, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_saddo_i32:
+define void @s_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind {
+ %sadd = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b) nounwind
+ %val = extractvalue { i32, i1 } %sadd, 0
+ %carry = extractvalue { i32, i1 } %sadd, 1
+ store i32 %val, i32 addrspace(1)* %out, align 4
+ store i1 %carry, i1 addrspace(1)* %carryout
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_saddo_i32:
+define void @v_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
+ %a = load i32, i32 addrspace(1)* %aptr, align 4
+ %b = load i32, i32 addrspace(1)* %bptr, align 4
+ %sadd = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b) nounwind
+ %val = extractvalue { i32, i1 } %sadd, 0
+ %carry = extractvalue { i32, i1 } %sadd, 1
+ store i32 %val, i32 addrspace(1)* %out, align 4
+ store i1 %carry, i1 addrspace(1)* %carryout
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_saddo_i64:
+define void @s_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind {
+ %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind
+ %val = extractvalue { i64, i1 } %sadd, 0
+ %carry = extractvalue { i64, i1 } %sadd, 1
+ store i64 %val, i64 addrspace(1)* %out, align 8
+ store i1 %carry, i1 addrspace(1)* %carryout
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_saddo_i64:
+; SI: v_add_i32
+; SI: v_addc_u32
+define void @v_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
+ %a = load i64, i64 addrspace(1)* %aptr, align 4
+ %b = load i64, i64 addrspace(1)* %bptr, align 4
+ %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind
+ %val = extractvalue { i64, i1 } %sadd, 0
+ %carry = extractvalue { i64, i1 } %sadd, 1
+ store i64 %val, i64 addrspace(1)* %out, align 8
+ store i1 %carry, i1 addrspace(1)* %carryout
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/salu-to-valu.ll b/test/CodeGen/AMDGPU/salu-to-valu.ll
new file mode 100644
index 000000000000..0b9649576545
--- /dev/null
+++ b/test/CodeGen/AMDGPU/salu-to-valu.ll
@@ -0,0 +1,118 @@
+; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s
+
+; In this test both the pointer and the offset operands to the
+; BUFFER_LOAD instructions end up being stored in vgprs. This
+; requires us to add the pointer and offset together, store the
+; result in the offset operand (vaddr), and then store 0 in an
+; sgpr register pair and use that for the pointer operand
+; (low 64-bits of srsrc).
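+;
+; A rough sketch of that addressing pattern, with register numbers chosen
+; purely for illustration (the checks below only match the general shape):
+;   v_add_i32_e32  v2, vcc, v0, v4        ; vaddr.lo = ptr.lo + offset.lo
+;   v_addc_u32_e32 v3, vcc, v1, v5, vcc   ; vaddr.hi = ptr.hi + offset.hi + carry
+;   s_mov_b64      s[0:1], 0              ; zero base in the low half of srsrc
+;   buffer_load_ubyte v6, v[2:3], s[0:3], 0 addr64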
+
+; CHECK-LABEL: {{^}}mubuf:
+
+; Make sure we aren't using VGPRs for the source operand of s_mov_b64
+; CHECK-NOT: s_mov_b64 s[{{[0-9]+:[0-9]+}}], v
+
+; Make sure we aren't using VGPRs for the srsrc operand of BUFFER_LOAD_*
+; instructions
+; CHECK: buffer_load_ubyte v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64
+; CHECK: buffer_load_ubyte v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64
+define void @mubuf(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
+entry:
+ %0 = call i32 @llvm.r600.read.tidig.x() #1
+ %1 = call i32 @llvm.r600.read.tidig.y() #1
+ %2 = sext i32 %0 to i64
+ %3 = sext i32 %1 to i64
+ br label %loop
+
+loop:
+ %4 = phi i64 [0, %entry], [%5, %loop]
+ %5 = add i64 %2, %4
+ %6 = getelementptr i8, i8 addrspace(1)* %in, i64 %5
+ %7 = load i8, i8 addrspace(1)* %6, align 1
+ %8 = or i64 %5, 1
+ %9 = getelementptr i8, i8 addrspace(1)* %in, i64 %8
+ %10 = load i8, i8 addrspace(1)* %9, align 1
+ %11 = add i8 %7, %10
+ %12 = sext i8 %11 to i32
+ store i32 %12, i32 addrspace(1)* %out
+ %13 = icmp slt i64 %5, 10
+ br i1 %13, label %loop, label %done
+
+done:
+ ret void
+}
+
+declare i32 @llvm.r600.read.tidig.x() #1
+declare i32 @llvm.r600.read.tidig.y() #1
+
+attributes #1 = { nounwind readnone }
+
+; Test moving an SMRD instruction to the VALU
+
+; CHECK-LABEL: {{^}}smrd_valu:
+; CHECK: buffer_load_dword [[OUT:v[0-9]+]]
+; CHECK: buffer_store_dword [[OUT]]
+
+define void @smrd_valu(i32 addrspace(2)* addrspace(1)* %in, i32 %a, i32 addrspace(1)* %out) {
+entry:
+ %0 = icmp ne i32 %a, 0
+ br i1 %0, label %if, label %else
+
+if:
+ %1 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(1)* %in
+ br label %endif
+
+else:
+ %2 = getelementptr i32 addrspace(2)*, i32 addrspace(2)* addrspace(1)* %in
+ %3 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(1)* %2
+ br label %endif
+
+endif:
+ %4 = phi i32 addrspace(2)* [%1, %if], [%3, %else]
+ %5 = getelementptr i32, i32 addrspace(2)* %4, i32 3000
+ %6 = load i32, i32 addrspace(2)* %5
+ store i32 %6, i32 addrspace(1)* %out
+ ret void
+}
+
+; Test moving an SMRD with an immediate offset to the VALU
+
+; CHECK-LABEL: {{^}}smrd_valu2:
+; CHECK: buffer_load_dword
+define void @smrd_valu2(i32 addrspace(1)* %out, [8 x i32] addrspace(2)* %in) {
+entry:
+ %0 = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %1 = add i32 %0, 4
+ %2 = getelementptr [8 x i32], [8 x i32] addrspace(2)* %in, i32 %0, i32 4
+ %3 = load i32, i32 addrspace(2)* %2
+ store i32 %3, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}s_load_imm_v8i32:
+; CHECK: buffer_load_dwordx4
+; CHECK: buffer_load_dwordx4
+define void @s_load_imm_v8i32(<8 x i32> addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) {
+entry:
+ %tmp0 = tail call i32 @llvm.r600.read.tidig.x() #1
+ %tmp1 = getelementptr inbounds i32, i32 addrspace(2)* %in, i32 %tmp0
+ %tmp2 = bitcast i32 addrspace(2)* %tmp1 to <8 x i32> addrspace(2)*
+ %tmp3 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp2, align 4
+ store <8 x i32> %tmp3, <8 x i32> addrspace(1)* %out, align 32
+ ret void
+}
+
+; CHECK-LABEL: {{^}}s_load_imm_v16i32:
+; CHECK: buffer_load_dwordx4
+; CHECK: buffer_load_dwordx4
+; CHECK: buffer_load_dwordx4
+; CHECK: buffer_load_dwordx4
+define void @s_load_imm_v16i32(<16 x i32> addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) {
+entry:
+ %tmp0 = tail call i32 @llvm.r600.read.tidig.x() #1
+ %tmp1 = getelementptr inbounds i32, i32 addrspace(2)* %in, i32 %tmp0
+ %tmp2 = bitcast i32 addrspace(2)* %tmp1 to <16 x i32> addrspace(2)*
+ %tmp3 = load <16 x i32>, <16 x i32> addrspace(2)* %tmp2, align 4
+ store <16 x i32> %tmp3, <16 x i32> addrspace(1)* %out, align 32
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/scalar_to_vector.ll b/test/CodeGen/AMDGPU/scalar_to_vector.ll
new file mode 100644
index 000000000000..0970e5d30630
--- /dev/null
+++ b/test/CodeGen/AMDGPU/scalar_to_vector.ll
@@ -0,0 +1,81 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+
+; FUNC-LABEL: {{^}}scalar_to_vector_v2i32:
+; SI: buffer_load_dword [[VAL:v[0-9]+]],
+; SI: v_lshrrev_b32_e32 [[RESULT:v[0-9]+]], 16, [[VAL]]
+; SI: buffer_store_short [[RESULT]]
+; SI: buffer_store_short [[RESULT]]
+; SI: buffer_store_short [[RESULT]]
+; SI: buffer_store_short [[RESULT]]
+; SI: s_endpgm
+define void @scalar_to_vector_v2i32(<4 x i16> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %tmp1 = load i32, i32 addrspace(1)* %in, align 4
+ %bc = bitcast i32 %tmp1 to <2 x i16>
+ %tmp2 = shufflevector <2 x i16> %bc, <2 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ store <4 x i16> %tmp2, <4 x i16> addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}scalar_to_vector_v2f32:
+; SI: buffer_load_dword [[VAL:v[0-9]+]],
+; SI: v_lshrrev_b32_e32 [[RESULT:v[0-9]+]], 16, [[VAL]]
+; SI: buffer_store_short [[RESULT]]
+; SI: buffer_store_short [[RESULT]]
+; SI: buffer_store_short [[RESULT]]
+; SI: buffer_store_short [[RESULT]]
+; SI: s_endpgm
+define void @scalar_to_vector_v2f32(<4 x i16> addrspace(1)* %out, float addrspace(1)* %in) nounwind {
+ %tmp1 = load float, float addrspace(1)* %in, align 4
+ %bc = bitcast float %tmp1 to <2 x i16>
+ %tmp2 = shufflevector <2 x i16> %bc, <2 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ store <4 x i16> %tmp2, <4 x i16> addrspace(1)* %out, align 8
+ ret void
+}
+
+; Getting a SCALAR_TO_VECTOR seems to be tricky. These cases managed
+; to produce one, but for some reason never made it to selection.
+
+
+; define void @scalar_to_vector_test2(<8 x i8> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+; %tmp1 = load i32, i32 addrspace(1)* %in, align 4
+; %bc = bitcast i32 %tmp1 to <4 x i8>
+
+; %tmp2 = shufflevector <4 x i8> %bc, <4 x i8> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; store <8 x i8> %tmp2, <8 x i8> addrspace(1)* %out, align 4
+; ret void
+; }
+
+; define void @scalar_to_vector_test3(<4 x i32> addrspace(1)* %out) nounwind {
+; %newvec0 = insertelement <2 x i64> undef, i64 12345, i32 0
+; %newvec1 = insertelement <2 x i64> %newvec0, i64 undef, i32 1
+; %bc = bitcast <2 x i64> %newvec1 to <4 x i32>
+; %add = add <4 x i32> %bc, <i32 1, i32 2, i32 3, i32 4>
+; store <4 x i32> %add, <4 x i32> addrspace(1)* %out, align 16
+; ret void
+; }
+
+; define void @scalar_to_vector_test4(<8 x i16> addrspace(1)* %out) nounwind {
+; %newvec0 = insertelement <4 x i32> undef, i32 12345, i32 0
+; %bc = bitcast <4 x i32> %newvec0 to <8 x i16>
+; %add = add <8 x i16> %bc, <i16 1, i16 2, i16 3, i16 4, i16 1, i16 2, i16 3, i16 4>
+; store <8 x i16> %add, <8 x i16> addrspace(1)* %out, align 16
+; ret void
+; }
+
+; define void @scalar_to_vector_test5(<4 x i16> addrspace(1)* %out) nounwind {
+; %newvec0 = insertelement <2 x i32> undef, i32 12345, i32 0
+; %bc = bitcast <2 x i32> %newvec0 to <4 x i16>
+; %add = add <4 x i16> %bc, <i16 1, i16 2, i16 3, i16 4>
+; store <4 x i16> %add, <4 x i16> addrspace(1)* %out, align 16
+; ret void
+; }
+
+; define void @scalar_to_vector_test6(<4 x i16> addrspace(1)* %out) nounwind {
+; %newvec0 = insertelement <2 x i32> undef, i32 12345, i32 0
+; %bc = bitcast <2 x i32> %newvec0 to <4 x i16>
+; %add = add <4 x i16> %bc, <i16 1, i16 2, i16 3, i16 4>
+; store <4 x i16> %add, <4 x i16> addrspace(1)* %out, align 16
+; ret void
+; }
diff --git a/test/CodeGen/AMDGPU/schedule-fs-loop-nested-if.ll b/test/CodeGen/AMDGPU/schedule-fs-loop-nested-if.ll
new file mode 100644
index 000000000000..11e8f5176f44
--- /dev/null
+++ b/test/CodeGen/AMDGPU/schedule-fs-loop-nested-if.ll
@@ -0,0 +1,82 @@
+;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs
+;REQUIRES: asserts
+
+define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #1 {
+main_body:
+ %0 = extractelement <4 x float> %reg1, i32 0
+ %1 = extractelement <4 x float> %reg1, i32 1
+ %2 = extractelement <4 x float> %reg1, i32 2
+ %3 = extractelement <4 x float> %reg1, i32 3
+ %4 = fcmp ult float %1, 0.000000e+00
+ %5 = select i1 %4, float 1.000000e+00, float 0.000000e+00
+ %6 = fsub float -0.000000e+00, %5
+ %7 = fptosi float %6 to i32
+ %8 = bitcast i32 %7 to float
+ %9 = fcmp ult float %0, 5.700000e+01
+ %10 = select i1 %9, float 1.000000e+00, float 0.000000e+00
+ %11 = fsub float -0.000000e+00, %10
+ %12 = fptosi float %11 to i32
+ %13 = bitcast i32 %12 to float
+ %14 = bitcast float %8 to i32
+ %15 = bitcast float %13 to i32
+ %16 = and i32 %14, %15
+ %17 = bitcast i32 %16 to float
+ %18 = bitcast float %17 to i32
+ %19 = icmp ne i32 %18, 0
+ %20 = fcmp ult float %0, 0.000000e+00
+ %21 = select i1 %20, float 1.000000e+00, float 0.000000e+00
+ %22 = fsub float -0.000000e+00, %21
+ %23 = fptosi float %22 to i32
+ %24 = bitcast i32 %23 to float
+ %25 = bitcast float %24 to i32
+ %26 = icmp ne i32 %25, 0
+ br i1 %19, label %IF, label %ELSE
+
+IF: ; preds = %main_body
+ %. = select i1 %26, float 0.000000e+00, float 1.000000e+00
+ %.18 = select i1 %26, float 1.000000e+00, float 0.000000e+00
+ br label %ENDIF
+
+ELSE: ; preds = %main_body
+ br i1 %26, label %ENDIF, label %ELSE17
+
+ENDIF: ; preds = %ELSE17, %ELSE, %IF
+ %temp1.0 = phi float [ %., %IF ], [ %48, %ELSE17 ], [ 0.000000e+00, %ELSE ]
+ %temp2.0 = phi float [ 0.000000e+00, %IF ], [ %49, %ELSE17 ], [ 1.000000e+00, %ELSE ]
+ %temp.0 = phi float [ %.18, %IF ], [ %47, %ELSE17 ], [ 0.000000e+00, %ELSE ]
+ %27 = call float @llvm.AMDIL.clamp.(float %temp.0, float 0.000000e+00, float 1.000000e+00)
+ %28 = call float @llvm.AMDIL.clamp.(float %temp1.0, float 0.000000e+00, float 1.000000e+00)
+ %29 = call float @llvm.AMDIL.clamp.(float %temp2.0, float 0.000000e+00, float 1.000000e+00)
+ %30 = call float @llvm.AMDIL.clamp.(float 1.000000e+00, float 0.000000e+00, float 1.000000e+00)
+ %31 = insertelement <4 x float> undef, float %27, i32 0
+ %32 = insertelement <4 x float> %31, float %28, i32 1
+ %33 = insertelement <4 x float> %32, float %29, i32 2
+ %34 = insertelement <4 x float> %33, float %30, i32 3
+ call void @llvm.R600.store.swizzle(<4 x float> %34, i32 0, i32 0)
+ ret void
+
+ELSE17: ; preds = %ELSE
+ %35 = fadd float 0.000000e+00, 0x3FC99999A0000000
+ %36 = fadd float 0.000000e+00, 0x3FC99999A0000000
+ %37 = fadd float 0.000000e+00, 0x3FC99999A0000000
+ %38 = fadd float %35, 0x3FC99999A0000000
+ %39 = fadd float %36, 0x3FC99999A0000000
+ %40 = fadd float %37, 0x3FC99999A0000000
+ %41 = fadd float %38, 0x3FC99999A0000000
+ %42 = fadd float %39, 0x3FC99999A0000000
+ %43 = fadd float %40, 0x3FC99999A0000000
+ %44 = fadd float %41, 0x3FC99999A0000000
+ %45 = fadd float %42, 0x3FC99999A0000000
+ %46 = fadd float %43, 0x3FC99999A0000000
+ %47 = fadd float %44, 0x3FC99999A0000000
+ %48 = fadd float %45, 0x3FC99999A0000000
+ %49 = fadd float %46, 0x3FC99999A0000000
+ br label %ENDIF
+}
+
+declare float @llvm.AMDIL.clamp.(float, float, float) #0
+
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+
+attributes #0 = { readnone }
+attributes #1 = { "ShaderType"="1" }
diff --git a/test/CodeGen/AMDGPU/schedule-fs-loop-nested.ll b/test/CodeGen/AMDGPU/schedule-fs-loop-nested.ll
new file mode 100644
index 000000000000..759197ca61f7
--- /dev/null
+++ b/test/CodeGen/AMDGPU/schedule-fs-loop-nested.ll
@@ -0,0 +1,88 @@
+;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs
+;REQUIRES: asserts
+
+define void @main() {
+main_body:
+ %0 = load <4 x float>, <4 x float> addrspace(9)* null
+ %1 = extractelement <4 x float> %0, i32 3
+ %2 = fptosi float %1 to i32
+ %3 = bitcast i32 %2 to float
+ %4 = bitcast float %3 to i32
+ %5 = sdiv i32 %4, 4
+ %6 = bitcast i32 %5 to float
+ %7 = bitcast float %6 to i32
+ %8 = mul i32 %7, 4
+ %9 = bitcast i32 %8 to float
+ %10 = bitcast float %9 to i32
+ %11 = sub i32 0, %10
+ %12 = bitcast i32 %11 to float
+ %13 = bitcast float %3 to i32
+ %14 = bitcast float %12 to i32
+ %15 = add i32 %13, %14
+ %16 = bitcast i32 %15 to float
+ %17 = load <4 x float>, <4 x float> addrspace(9)* null
+ %18 = extractelement <4 x float> %17, i32 0
+ %19 = load <4 x float>, <4 x float> addrspace(9)* null
+ %20 = extractelement <4 x float> %19, i32 1
+ %21 = load <4 x float>, <4 x float> addrspace(9)* null
+ %22 = extractelement <4 x float> %21, i32 2
+ br label %LOOP
+
+LOOP: ; preds = %IF31, %main_body
+ %temp12.0 = phi float [ 0.000000e+00, %main_body ], [ %47, %IF31 ]
+ %temp6.0 = phi float [ %22, %main_body ], [ %temp6.1, %IF31 ]
+ %temp5.0 = phi float [ %20, %main_body ], [ %temp5.1, %IF31 ]
+ %temp4.0 = phi float [ %18, %main_body ], [ %temp4.1, %IF31 ]
+ %23 = bitcast float %temp12.0 to i32
+ %24 = bitcast float %6 to i32
+ %25 = icmp sge i32 %23, %24
+ %26 = sext i1 %25 to i32
+ %27 = bitcast i32 %26 to float
+ %28 = bitcast float %27 to i32
+ %29 = icmp ne i32 %28, 0
+ br i1 %29, label %IF, label %LOOP29
+
+IF: ; preds = %LOOP
+ %30 = call float @llvm.AMDIL.clamp.(float %temp4.0, float 0.000000e+00, float 1.000000e+00)
+ %31 = call float @llvm.AMDIL.clamp.(float %temp5.0, float 0.000000e+00, float 1.000000e+00)
+ %32 = call float @llvm.AMDIL.clamp.(float %temp6.0, float 0.000000e+00, float 1.000000e+00)
+ %33 = call float @llvm.AMDIL.clamp.(float 1.000000e+00, float 0.000000e+00, float 1.000000e+00)
+ %34 = insertelement <4 x float> undef, float %30, i32 0
+ %35 = insertelement <4 x float> %34, float %31, i32 1
+ %36 = insertelement <4 x float> %35, float %32, i32 2
+ %37 = insertelement <4 x float> %36, float %33, i32 3
+ call void @llvm.R600.store.swizzle(<4 x float> %37, i32 0, i32 0)
+ ret void
+
+LOOP29: ; preds = %LOOP, %ENDIF30
+ %temp6.1 = phi float [ %temp4.1, %ENDIF30 ], [ %temp6.0, %LOOP ]
+ %temp5.1 = phi float [ %temp6.1, %ENDIF30 ], [ %temp5.0, %LOOP ]
+ %temp4.1 = phi float [ %temp5.1, %ENDIF30 ], [ %temp4.0, %LOOP ]
+ %temp20.0 = phi float [ %50, %ENDIF30 ], [ 0.000000e+00, %LOOP ]
+ %38 = bitcast float %temp20.0 to i32
+ %39 = bitcast float %16 to i32
+ %40 = icmp sge i32 %38, %39
+ %41 = sext i1 %40 to i32
+ %42 = bitcast i32 %41 to float
+ %43 = bitcast float %42 to i32
+ %44 = icmp ne i32 %43, 0
+ br i1 %44, label %IF31, label %ENDIF30
+
+IF31: ; preds = %LOOP29
+ %45 = bitcast float %temp12.0 to i32
+ %46 = add i32 %45, 1
+ %47 = bitcast i32 %46 to float
+ br label %LOOP
+
+ENDIF30: ; preds = %LOOP29
+ %48 = bitcast float %temp20.0 to i32
+ %49 = add i32 %48, 1
+ %50 = bitcast i32 %49 to float
+ br label %LOOP29
+}
+
+declare float @llvm.AMDIL.clamp.(float, float, float) #0
+
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+
+attributes #0 = { readnone }
diff --git a/test/CodeGen/AMDGPU/schedule-fs-loop.ll b/test/CodeGen/AMDGPU/schedule-fs-loop.ll
new file mode 100644
index 000000000000..28cc08abc022
--- /dev/null
+++ b/test/CodeGen/AMDGPU/schedule-fs-loop.ll
@@ -0,0 +1,55 @@
+;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs
+;REQUIRES: asserts
+
+define void @main() {
+main_body:
+ %0 = load <4 x float>, <4 x float> addrspace(9)* null
+ %1 = extractelement <4 x float> %0, i32 3
+ %2 = fptosi float %1 to i32
+ %3 = bitcast i32 %2 to float
+ %4 = load <4 x float>, <4 x float> addrspace(9)* null
+ %5 = extractelement <4 x float> %4, i32 0
+ %6 = load <4 x float>, <4 x float> addrspace(9)* null
+ %7 = extractelement <4 x float> %6, i32 1
+ %8 = load <4 x float>, <4 x float> addrspace(9)* null
+ %9 = extractelement <4 x float> %8, i32 2
+ br label %LOOP
+
+LOOP: ; preds = %ENDIF, %main_body
+ %temp4.0 = phi float [ %5, %main_body ], [ %temp5.0, %ENDIF ]
+ %temp5.0 = phi float [ %7, %main_body ], [ %temp6.0, %ENDIF ]
+ %temp6.0 = phi float [ %9, %main_body ], [ %temp4.0, %ENDIF ]
+ %temp8.0 = phi float [ 0.000000e+00, %main_body ], [ %27, %ENDIF ]
+ %10 = bitcast float %temp8.0 to i32
+ %11 = bitcast float %3 to i32
+ %12 = icmp sge i32 %10, %11
+ %13 = sext i1 %12 to i32
+ %14 = bitcast i32 %13 to float
+ %15 = bitcast float %14 to i32
+ %16 = icmp ne i32 %15, 0
+ br i1 %16, label %IF, label %ENDIF
+
+IF: ; preds = %LOOP
+ %17 = call float @llvm.AMDIL.clamp.(float %temp4.0, float 0.000000e+00, float 1.000000e+00)
+ %18 = call float @llvm.AMDIL.clamp.(float %temp5.0, float 0.000000e+00, float 1.000000e+00)
+ %19 = call float @llvm.AMDIL.clamp.(float %temp6.0, float 0.000000e+00, float 1.000000e+00)
+ %20 = call float @llvm.AMDIL.clamp.(float 1.000000e+00, float 0.000000e+00, float 1.000000e+00)
+ %21 = insertelement <4 x float> undef, float %17, i32 0
+ %22 = insertelement <4 x float> %21, float %18, i32 1
+ %23 = insertelement <4 x float> %22, float %19, i32 2
+ %24 = insertelement <4 x float> %23, float %20, i32 3
+ call void @llvm.R600.store.swizzle(<4 x float> %24, i32 0, i32 0)
+ ret void
+
+ENDIF: ; preds = %LOOP
+ %25 = bitcast float %temp8.0 to i32
+ %26 = add i32 %25, 1
+ %27 = bitcast i32 %26 to float
+ br label %LOOP
+}
+
+declare float @llvm.AMDIL.clamp.(float, float, float) #0
+
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+
+attributes #0 = { readnone }
diff --git a/test/CodeGen/AMDGPU/schedule-global-loads.ll b/test/CodeGen/AMDGPU/schedule-global-loads.ll
new file mode 100644
index 000000000000..3f728fd873b3
--- /dev/null
+++ b/test/CodeGen/AMDGPU/schedule-global-loads.ll
@@ -0,0 +1,41 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s
+
+
+declare i32 @llvm.r600.read.tidig.x() #1
+
+; FIXME: This currently doesn't do a great job of clustering the
+; loads, which end up with extra moves between them. Right now, it
+; seems the only thing areLoadsFromSameBasePtr is accomplishing is
+; ordering the loads so that the lower address loads come first.
+
+; FUNC-LABEL: {{^}}cluster_global_arg_loads:
+; SI-DAG: buffer_load_dword [[REG0:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; SI-DAG: buffer_load_dword [[REG1:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4
+; SI: buffer_store_dword [[REG0]]
+; SI: buffer_store_dword [[REG1]]
+define void @cluster_global_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %ptr) #0 {
+ %load0 = load i32, i32 addrspace(1)* %ptr, align 4
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 1
+ %load1 = load i32, i32 addrspace(1)* %gep, align 4
+ store i32 %load0, i32 addrspace(1)* %out0, align 4
+ store i32 %load1, i32 addrspace(1)* %out1, align 4
+ ret void
+}
+
+; Test for a crash in SIInstrInfo::areLoadsFromSameBasePtr() when checking
+; an MUBUF load which does not have a vaddr operand.
+; FUNC-LABEL: {{^}}same_base_ptr_crash:
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+define void @same_base_ptr_crash(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %offset) {
+entry:
+ %out1 = getelementptr i32, i32 addrspace(1)* %out, i32 %offset
+ %tmp0 = load i32, i32 addrspace(1)* %out
+ %tmp1 = load i32, i32 addrspace(1)* %out1
+ %tmp2 = add i32 %tmp0, %tmp1
+ store i32 %tmp2, i32 addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/schedule-if-2.ll b/test/CodeGen/AMDGPU/schedule-if-2.ll
new file mode 100644
index 000000000000..549465096833
--- /dev/null
+++ b/test/CodeGen/AMDGPU/schedule-if-2.ll
@@ -0,0 +1,94 @@
+;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs
+;REQUIRES: asserts
+
+define void @main() {
+main_body:
+ %0 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+ %1 = extractelement <4 x float> %0, i32 0
+ %2 = fadd float 1.000000e+03, %1
+ %3 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+ %4 = extractelement <4 x float> %3, i32 0
+ %5 = bitcast float %4 to i32
+ %6 = icmp eq i32 %5, 0
+ %7 = sext i1 %6 to i32
+ %8 = bitcast i32 %7 to float
+ %9 = bitcast float %8 to i32
+ %10 = icmp ne i32 %9, 0
+ br i1 %10, label %IF, label %ELSE
+
+IF: ; preds = %main_body
+ %11 = call float @fabs(float %2)
+ %12 = fcmp ueq float %11, 0x7FF0000000000000
+ %13 = select i1 %12, float 1.000000e+00, float 0.000000e+00
+ %14 = fsub float -0.000000e+00, %13
+ %15 = fptosi float %14 to i32
+ %16 = bitcast i32 %15 to float
+ %17 = bitcast float %16 to i32
+ %18 = icmp ne i32 %17, 0
+ %. = select i1 %18, float 0x36A0000000000000, float 0.000000e+00
+ %19 = fcmp une float %2, %2
+ %20 = select i1 %19, float 1.000000e+00, float 0.000000e+00
+ %21 = fsub float -0.000000e+00, %20
+ %22 = fptosi float %21 to i32
+ %23 = bitcast i32 %22 to float
+ %24 = bitcast float %23 to i32
+ %25 = icmp ne i32 %24, 0
+ %temp8.0 = select i1 %25, float 0x36A0000000000000, float 0.000000e+00
+ %26 = bitcast float %. to i32
+ %27 = sitofp i32 %26 to float
+ %28 = bitcast float %temp8.0 to i32
+ %29 = sitofp i32 %28 to float
+ %30 = fcmp ugt float %2, 0.000000e+00
+ %31 = select i1 %30, float 1.000000e+00, float %2
+ %32 = fcmp uge float %31, 0.000000e+00
+ %33 = select i1 %32, float %31, float -1.000000e+00
+ %34 = fadd float %33, 1.000000e+00
+ %35 = fmul float %34, 5.000000e-01
+ br label %ENDIF
+
+ELSE: ; preds = %main_body
+ %36 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+ %37 = extractelement <4 x float> %36, i32 0
+ %38 = bitcast float %37 to i32
+ %39 = icmp eq i32 %38, 1
+ %40 = sext i1 %39 to i32
+ %41 = bitcast i32 %40 to float
+ %42 = bitcast float %41 to i32
+ %43 = icmp ne i32 %42, 0
+ br i1 %43, label %IF23, label %ENDIF
+
+ENDIF: ; preds = %IF23, %ELSE, %IF
+ %temp4.0 = phi float [ %2, %IF ], [ %56, %IF23 ], [ 0.000000e+00, %ELSE ]
+ %temp5.0 = phi float [ %27, %IF ], [ %60, %IF23 ], [ 0.000000e+00, %ELSE ]
+ %temp6.0 = phi float [ %29, %IF ], [ 0.000000e+00, %ELSE ], [ 0.000000e+00, %IF23 ]
+ %temp7.0 = phi float [ %35, %IF ], [ 0.000000e+00, %ELSE ], [ 0.000000e+00, %IF23 ]
+ %44 = insertelement <4 x float> undef, float %temp4.0, i32 0
+ %45 = insertelement <4 x float> %44, float %temp5.0, i32 1
+ %46 = insertelement <4 x float> %45, float %temp6.0, i32 2
+ %47 = insertelement <4 x float> %46, float %temp7.0, i32 3
+ call void @llvm.R600.store.swizzle(<4 x float> %47, i32 0, i32 0)
+ ret void
+
+IF23: ; preds = %ELSE
+ %48 = fcmp ult float 0.000000e+00, %2
+ %49 = select i1 %48, float 1.000000e+00, float 0.000000e+00
+ %50 = fsub float -0.000000e+00, %49
+ %51 = fptosi float %50 to i32
+ %52 = bitcast i32 %51 to float
+ %53 = bitcast float %52 to i32
+ %54 = icmp ne i32 %53, 0
+ %.28 = select i1 %54, float 0x36A0000000000000, float 0.000000e+00
+ %55 = bitcast float %.28 to i32
+ %56 = sitofp i32 %55 to float
+ %57 = load <4 x float>, <4 x float> addrspace(8)* null
+ %58 = extractelement <4 x float> %57, i32 0
+ %59 = fsub float -0.000000e+00, %58
+ %60 = fadd float %2, %59
+ br label %ENDIF
+}
+
+declare float @fabs(float) #0
+
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+
+attributes #0 = { readonly }
diff --git a/test/CodeGen/AMDGPU/schedule-if.ll b/test/CodeGen/AMDGPU/schedule-if.ll
new file mode 100644
index 000000000000..94c653c8f25b
--- /dev/null
+++ b/test/CodeGen/AMDGPU/schedule-if.ll
@@ -0,0 +1,46 @@
+;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs
+;REQUIRES: asserts
+
+define void @main() {
+main_body:
+ %0 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+ %1 = extractelement <4 x float> %0, i32 0
+ %2 = bitcast float %1 to i32
+ %3 = icmp eq i32 %2, 0
+ %4 = sext i1 %3 to i32
+ %5 = bitcast i32 %4 to float
+ %6 = bitcast float %5 to i32
+ %7 = icmp ne i32 %6, 0
+ br i1 %7, label %ENDIF, label %ELSE
+
+ELSE: ; preds = %main_body
+ %8 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+ %9 = extractelement <4 x float> %8, i32 0
+ %10 = bitcast float %9 to i32
+ %11 = icmp eq i32 %10, 1
+ %12 = sext i1 %11 to i32
+ %13 = bitcast i32 %12 to float
+ %14 = bitcast float %13 to i32
+ %15 = icmp ne i32 %14, 0
+ br i1 %15, label %IF13, label %ENDIF
+
+ENDIF: ; preds = %IF13, %ELSE, %main_body
+ %temp.0 = phi float [ 1.000000e+03, %main_body ], [ 1.000000e+00, %IF13 ], [ 0.000000e+00, %ELSE ]
+ %temp1.0 = phi float [ 0.000000e+00, %main_body ], [ %23, %IF13 ], [ 0.000000e+00, %ELSE ]
+ %temp3.0 = phi float [ 1.000000e+00, %main_body ], [ 0.000000e+00, %ELSE ], [ 0.000000e+00, %IF13 ]
+ %16 = insertelement <4 x float> undef, float %temp.0, i32 0
+ %17 = insertelement <4 x float> %16, float %temp1.0, i32 1
+ %18 = insertelement <4 x float> %17, float 0.000000e+00, i32 2
+ %19 = insertelement <4 x float> %18, float %temp3.0, i32 3
+ call void @llvm.R600.store.swizzle(<4 x float> %19, i32 0, i32 0)
+ ret void
+
+IF13: ; preds = %ELSE
+ %20 = load <4 x float>, <4 x float> addrspace(8)* null
+ %21 = extractelement <4 x float> %20, i32 0
+ %22 = fsub float -0.000000e+00, %21
+ %23 = fadd float 1.000000e+03, %22
+ br label %ENDIF
+}
+
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
diff --git a/test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll b/test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll
new file mode 100644
index 000000000000..6b3e0814c380
--- /dev/null
+++ b/test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll
@@ -0,0 +1,51 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI --check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=VI --check-prefix=GCN %s
+
+; FUNC-LABEL: {{^}}cluster_arg_loads:
+; SI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x9
+; SI-NEXT: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
+; SI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xe
+; VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x24
+; VI-NEXT: s_nop 0
+; VI-NEXT: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; VI-NEXT: s_nop 0
+; VI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34
+; VI-NEXT: s_nop 0
+; VI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x38
+define void @cluster_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %x, i32 %y) nounwind {
+ store i32 %x, i32 addrspace(1)* %out0, align 4
+ store i32 %y, i32 addrspace(1)* %out1, align 4
+ ret void
+}
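+
+; Aside (an inference from the offsets checked above, assuming the usual
+; encodings in which SMRD immediates count dwords on SI and bytes on VI),
+; sketched in Python:
+;   si_dword_offsets = [0x9, 0xb, 0xd, 0xe]
+;   vi_byte_offsets = [off * 4 for off in si_dword_offsets]
+;   assert vi_byte_offsets == [0x24, 0x2c, 0x34, 0x38]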
+
+; Test for a crash in SIInstrInfo::areLoadsFromSameBasePtr() when
+; s_load_dwordx2 has a register offset
+
+; FUNC-LABEL: @same_base_ptr_crash
+; GCN: s_load_dwordx2
+; GCN: s_load_dwordx2
+; GCN: s_load_dwordx2
+; GCN: s_endpgm
+define void @same_base_ptr_crash(i64 addrspace(1)* %out,
+ i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %arg5, i64 %arg6, i64 %arg7,
+ i64 %arg8, i64 %arg9, i64 %arg10, i64 %arg11, i64 %arg12, i64 %arg13, i64 %arg14, i64 %arg15,
+ i64 %arg16, i64 %arg17, i64 %arg18, i64 %arg19, i64 %arg20, i64 %arg21, i64 %arg22, i64 %arg23,
+ i64 %arg24, i64 %arg25, i64 %arg26, i64 %arg27, i64 %arg28, i64 %arg29, i64 %arg30, i64 %arg31,
+ i64 %arg32, i64 %arg33, i64 %arg34, i64 %arg35, i64 %arg36, i64 %arg37, i64 %arg38, i64 %arg39,
+ i64 %arg40, i64 %arg41, i64 %arg42, i64 %arg43, i64 %arg44, i64 %arg45, i64 %arg46, i64 %arg47,
+ i64 %arg48, i64 %arg49, i64 %arg50, i64 %arg51, i64 %arg52, i64 %arg53, i64 %arg54, i64 %arg55,
+ i64 %arg56, i64 %arg57, i64 %arg58, i64 %arg59, i64 %arg60, i64 %arg61, i64 %arg62, i64 %arg63,
+ i64 %arg64, i64 %arg65, i64 %arg66, i64 %arg67, i64 %arg68, i64 %arg69, i64 %arg70, i64 %arg71,
+ i64 %arg72, i64 %arg73, i64 %arg74, i64 %arg75, i64 %arg76, i64 %arg77, i64 %arg78, i64 %arg79,
+ i64 %arg80, i64 %arg81, i64 %arg82, i64 %arg83, i64 %arg84, i64 %arg85, i64 %arg86, i64 %arg87,
+ i64 %arg88, i64 %arg89, i64 %arg90, i64 %arg91, i64 %arg92, i64 %arg93, i64 %arg94, i64 %arg95,
+ i64 %arg96, i64 %arg97, i64 %arg98, i64 %arg99, i64 %arg100, i64 %arg101, i64 %arg102, i64 %arg103,
+ i64 %arg104, i64 %arg105, i64 %arg106, i64 %arg107, i64 %arg108, i64 %arg109, i64 %arg110, i64 %arg111,
+ i64 %arg112, i64 %arg113, i64 %arg114, i64 %arg115, i64 %arg116, i64 %arg117, i64 %arg118, i64 %arg119,
+ i64 %arg120, i64 %arg121, i64 %arg122, i64 %arg123, i64 %arg124, i64 %arg125, i64 %arg126) {
+entry:
+ %value = add i64 %arg125, %arg126
+ store i64 %value, i64 addrspace(1)* %out, align 8
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop-failure.ll b/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop-failure.ll
new file mode 100644
index 000000000000..3863afda5dd3
--- /dev/null
+++ b/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop-failure.ll
@@ -0,0 +1,163 @@
+; XFAIL: *
+; REQUIRES: asserts
+; RUN: llc -O0 -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck %s -check-prefix=SI
+; RUN: llc -O0 -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck %s -check-prefix=SI
+
+declare void @llvm.AMDGPU.barrier.local() nounwind noduplicate
+
+
+; SI-LABEL: {{^}}main(
+define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 {
+main_body:
+ %0 = extractelement <4 x float> %reg1, i32 0
+ %1 = extractelement <4 x float> %reg1, i32 2
+ %2 = fcmp ult float %0, 0.000000e+00
+ %3 = select i1 %2, float 1.000000e+00, float 0.000000e+00
+ %4 = fsub float -0.000000e+00, %3
+ %5 = fptosi float %4 to i32
+ %6 = bitcast i32 %5 to float
+ %7 = bitcast float %6 to i32
+ %8 = icmp ne i32 %7, 0
+ br i1 %8, label %LOOP, label %ENDIF
+
+Flow1: ; preds = %ENDIF19, %ENDIF16
+ %9 = phi float [ %115, %ENDIF19 ], [ undef, %ENDIF16 ]
+ %10 = phi float [ %114, %ENDIF19 ], [ undef, %ENDIF16 ]
+ %11 = phi float [ %113, %ENDIF19 ], [ undef, %ENDIF16 ]
+ %12 = phi float [ %112, %ENDIF19 ], [ undef, %ENDIF16 ]
+ %13 = phi float [ %111, %ENDIF19 ], [ undef, %ENDIF16 ]
+ %14 = phi i1 [ false, %ENDIF19 ], [ true, %ENDIF16 ]
+ br label %Flow
+
+Flow2: ; preds = %Flow
+ br label %ENDIF
+
+ENDIF: ; preds = %main_body, %Flow2
+ %temp.0 = phi float [ 0.000000e+00, %main_body ], [ %104, %Flow2 ]
+ %temp1.0 = phi float [ 1.000000e+00, %main_body ], [ %103, %Flow2 ]
+ %temp2.0 = phi float [ 0.000000e+00, %main_body ], [ %102, %Flow2 ]
+ %temp3.0 = phi float [ 0.000000e+00, %main_body ], [ %101, %Flow2 ]
+ %15 = extractelement <4 x float> %reg1, i32 1
+ %16 = extractelement <4 x float> %reg1, i32 3
+ %17 = load <4 x float>, <4 x float> addrspace(9)* null
+ %18 = extractelement <4 x float> %17, i32 0
+ %19 = fmul float %18, %0
+ %20 = load <4 x float>, <4 x float> addrspace(9)* null
+ %21 = extractelement <4 x float> %20, i32 1
+ %22 = fmul float %21, %0
+ %23 = load <4 x float>, <4 x float> addrspace(9)* null
+ %24 = extractelement <4 x float> %23, i32 2
+ %25 = fmul float %24, %0
+ %26 = load <4 x float>, <4 x float> addrspace(9)* null
+ %27 = extractelement <4 x float> %26, i32 3
+ %28 = fmul float %27, %0
+ %29 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1)
+ %30 = extractelement <4 x float> %29, i32 0
+ %31 = fmul float %30, %15
+ %32 = fadd float %31, %19
+ %33 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1)
+ %34 = extractelement <4 x float> %33, i32 1
+ %35 = fmul float %34, %15
+ %36 = fadd float %35, %22
+ %37 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1)
+ %38 = extractelement <4 x float> %37, i32 2
+ %39 = fmul float %38, %15
+ %40 = fadd float %39, %25
+ %41 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1)
+ %42 = extractelement <4 x float> %41, i32 3
+ %43 = fmul float %42, %15
+ %44 = fadd float %43, %28
+ %45 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2)
+ %46 = extractelement <4 x float> %45, i32 0
+ %47 = fmul float %46, %1
+ %48 = fadd float %47, %32
+ %49 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2)
+ %50 = extractelement <4 x float> %49, i32 1
+ %51 = fmul float %50, %1
+ %52 = fadd float %51, %36
+ %53 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2)
+ %54 = extractelement <4 x float> %53, i32 2
+ %55 = fmul float %54, %1
+ %56 = fadd float %55, %40
+ %57 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2)
+ %58 = extractelement <4 x float> %57, i32 3
+ %59 = fmul float %58, %1
+ %60 = fadd float %59, %44
+ %61 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3)
+ %62 = extractelement <4 x float> %61, i32 0
+ %63 = fmul float %62, %16
+ %64 = fadd float %63, %48
+ %65 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3)
+ %66 = extractelement <4 x float> %65, i32 1
+ %67 = fmul float %66, %16
+ %68 = fadd float %67, %52
+ %69 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3)
+ %70 = extractelement <4 x float> %69, i32 2
+ %71 = fmul float %70, %16
+ %72 = fadd float %71, %56
+ %73 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3)
+ %74 = extractelement <4 x float> %73, i32 3
+ %75 = fmul float %74, %16
+ %76 = fadd float %75, %60
+ %77 = insertelement <4 x float> undef, float %64, i32 0
+ %78 = insertelement <4 x float> %77, float %68, i32 1
+ %79 = insertelement <4 x float> %78, float %72, i32 2
+ %80 = insertelement <4 x float> %79, float %76, i32 3
+ call void @llvm.AMDGPU.barrier.local()
+ %81 = insertelement <4 x float> undef, float %temp.0, i32 0
+ %82 = insertelement <4 x float> %81, float %temp1.0, i32 1
+ %83 = insertelement <4 x float> %82, float %temp2.0, i32 2
+ %84 = insertelement <4 x float> %83, float %temp3.0, i32 3
+ call void @llvm.AMDGPU.barrier.local()
+ ret void
+
+LOOP: ; preds = %main_body, %Flow
+ %temp.1 = phi float [ %109, %Flow ], [ 0.000000e+00, %main_body ]
+ %temp1.1 = phi float [ %108, %Flow ], [ 1.000000e+00, %main_body ]
+ %temp2.1 = phi float [ %107, %Flow ], [ 0.000000e+00, %main_body ]
+ %temp3.1 = phi float [ %106, %Flow ], [ 0.000000e+00, %main_body ]
+ %temp4.0 = phi float [ %105, %Flow ], [ -2.000000e+00, %main_body ]
+ %85 = fcmp uge float %temp4.0, %0
+ %86 = select i1 %85, float 1.000000e+00, float 0.000000e+00
+ %87 = fsub float -0.000000e+00, %86
+ %88 = fptosi float %87 to i32
+ %89 = bitcast i32 %88 to float
+ %90 = bitcast float %89 to i32
+ %91 = icmp ne i32 %90, 0
+ %92 = xor i1 %91, true
+ br i1 %92, label %ENDIF16, label %Flow
+
+ENDIF16: ; preds = %LOOP
+ %93 = fcmp une float %1, %temp4.0
+ %94 = select i1 %93, float 1.000000e+00, float 0.000000e+00
+ %95 = fsub float -0.000000e+00, %94
+ %96 = fptosi float %95 to i32
+ %97 = bitcast i32 %96 to float
+ %98 = bitcast float %97 to i32
+ %99 = icmp ne i32 %98, 0
+ %100 = xor i1 %99, true
+ br i1 %100, label %ENDIF19, label %Flow1
+
+Flow: ; preds = %Flow1, %LOOP
+ %101 = phi float [ %temp3.1, %Flow1 ], [ %temp3.1, %LOOP ]
+ %102 = phi float [ %temp2.1, %Flow1 ], [ %temp2.1, %LOOP ]
+ %103 = phi float [ %temp1.1, %Flow1 ], [ %temp1.1, %LOOP ]
+ %104 = phi float [ %temp.1, %Flow1 ], [ %temp.1, %LOOP ]
+ %105 = phi float [ %9, %Flow1 ], [ undef, %LOOP ]
+ %106 = phi float [ %10, %Flow1 ], [ undef, %LOOP ]
+ %107 = phi float [ %11, %Flow1 ], [ undef, %LOOP ]
+ %108 = phi float [ %12, %Flow1 ], [ undef, %LOOP ]
+ %109 = phi float [ %13, %Flow1 ], [ undef, %LOOP ]
+ %110 = phi i1 [ %14, %Flow1 ], [ true, %LOOP ]
+ br i1 %110, label %Flow2, label %LOOP
+
+ENDIF19: ; preds = %ENDIF16
+ %111 = fadd float %temp.1, 1.000000e+00
+ %112 = fadd float %temp1.1, 0.000000e+00
+ %113 = fadd float %temp2.1, 0.000000e+00
+ %114 = fadd float %temp3.1, 0.000000e+00
+ %115 = fadd float %temp4.0, 1.000000e+00
+ br label %Flow1
+}
+
+attributes #0 = { "ShaderType"="1" }
diff --git a/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop.ll b/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop.ll
new file mode 100644
index 000000000000..8d980dbf8995
--- /dev/null
+++ b/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop.ll
@@ -0,0 +1,132 @@
+;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched
+;REQUIRES: asserts
+
+define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 {
+main_body:
+ %0 = extractelement <4 x float> %reg1, i32 0
+ %1 = extractelement <4 x float> %reg1, i32 1
+ %2 = extractelement <4 x float> %reg1, i32 2
+ %3 = extractelement <4 x float> %reg1, i32 3
+ %4 = fcmp ult float %0, 0.000000e+00
+ %5 = select i1 %4, float 1.000000e+00, float 0.000000e+00
+ %6 = fsub float -0.000000e+00, %5
+ %7 = fptosi float %6 to i32
+ %8 = bitcast i32 %7 to float
+ %9 = bitcast float %8 to i32
+ %10 = icmp ne i32 %9, 0
+ br i1 %10, label %LOOP, label %ENDIF
+
+ENDIF: ; preds = %ENDIF16, %LOOP, %main_body
+ %temp.0 = phi float [ 0.000000e+00, %main_body ], [ %temp.1, %LOOP ], [ %temp.1, %ENDIF16 ]
+ %temp1.0 = phi float [ 1.000000e+00, %main_body ], [ %temp1.1, %LOOP ], [ %temp1.1, %ENDIF16 ]
+ %temp2.0 = phi float [ 0.000000e+00, %main_body ], [ %temp2.1, %LOOP ], [ %temp2.1, %ENDIF16 ]
+ %temp3.0 = phi float [ 0.000000e+00, %main_body ], [ %temp3.1, %LOOP ], [ %temp3.1, %ENDIF16 ]
+ %11 = load <4 x float>, <4 x float> addrspace(9)* null
+ %12 = extractelement <4 x float> %11, i32 0
+ %13 = fmul float %12, %0
+ %14 = load <4 x float>, <4 x float> addrspace(9)* null
+ %15 = extractelement <4 x float> %14, i32 1
+ %16 = fmul float %15, %0
+ %17 = load <4 x float>, <4 x float> addrspace(9)* null
+ %18 = extractelement <4 x float> %17, i32 2
+ %19 = fmul float %18, %0
+ %20 = load <4 x float>, <4 x float> addrspace(9)* null
+ %21 = extractelement <4 x float> %20, i32 3
+ %22 = fmul float %21, %0
+ %23 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1)
+ %24 = extractelement <4 x float> %23, i32 0
+ %25 = fmul float %24, %1
+ %26 = fadd float %25, %13
+ %27 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1)
+ %28 = extractelement <4 x float> %27, i32 1
+ %29 = fmul float %28, %1
+ %30 = fadd float %29, %16
+ %31 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1)
+ %32 = extractelement <4 x float> %31, i32 2
+ %33 = fmul float %32, %1
+ %34 = fadd float %33, %19
+ %35 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1)
+ %36 = extractelement <4 x float> %35, i32 3
+ %37 = fmul float %36, %1
+ %38 = fadd float %37, %22
+ %39 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2)
+ %40 = extractelement <4 x float> %39, i32 0
+ %41 = fmul float %40, %2
+ %42 = fadd float %41, %26
+ %43 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2)
+ %44 = extractelement <4 x float> %43, i32 1
+ %45 = fmul float %44, %2
+ %46 = fadd float %45, %30
+ %47 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2)
+ %48 = extractelement <4 x float> %47, i32 2
+ %49 = fmul float %48, %2
+ %50 = fadd float %49, %34
+ %51 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2)
+ %52 = extractelement <4 x float> %51, i32 3
+ %53 = fmul float %52, %2
+ %54 = fadd float %53, %38
+ %55 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3)
+ %56 = extractelement <4 x float> %55, i32 0
+ %57 = fmul float %56, %3
+ %58 = fadd float %57, %42
+ %59 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3)
+ %60 = extractelement <4 x float> %59, i32 1
+ %61 = fmul float %60, %3
+ %62 = fadd float %61, %46
+ %63 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3)
+ %64 = extractelement <4 x float> %63, i32 2
+ %65 = fmul float %64, %3
+ %66 = fadd float %65, %50
+ %67 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3)
+ %68 = extractelement <4 x float> %67, i32 3
+ %69 = fmul float %68, %3
+ %70 = fadd float %69, %54
+ %71 = insertelement <4 x float> undef, float %58, i32 0
+ %72 = insertelement <4 x float> %71, float %62, i32 1
+ %73 = insertelement <4 x float> %72, float %66, i32 2
+ %74 = insertelement <4 x float> %73, float %70, i32 3
+ call void @llvm.R600.store.swizzle(<4 x float> %74, i32 60, i32 1)
+ %75 = insertelement <4 x float> undef, float %temp.0, i32 0
+ %76 = insertelement <4 x float> %75, float %temp1.0, i32 1
+ %77 = insertelement <4 x float> %76, float %temp2.0, i32 2
+ %78 = insertelement <4 x float> %77, float %temp3.0, i32 3
+ call void @llvm.R600.store.swizzle(<4 x float> %78, i32 0, i32 2)
+ ret void
+
+LOOP: ; preds = %main_body, %ENDIF19
+ %temp.1 = phi float [ %93, %ENDIF19 ], [ 0.000000e+00, %main_body ]
+ %temp1.1 = phi float [ %94, %ENDIF19 ], [ 1.000000e+00, %main_body ]
+ %temp2.1 = phi float [ %95, %ENDIF19 ], [ 0.000000e+00, %main_body ]
+ %temp3.1 = phi float [ %96, %ENDIF19 ], [ 0.000000e+00, %main_body ]
+ %temp4.0 = phi float [ %97, %ENDIF19 ], [ -2.000000e+00, %main_body ]
+ %79 = fcmp uge float %temp4.0, %0
+ %80 = select i1 %79, float 1.000000e+00, float 0.000000e+00
+ %81 = fsub float -0.000000e+00, %80
+ %82 = fptosi float %81 to i32
+ %83 = bitcast i32 %82 to float
+ %84 = bitcast float %83 to i32
+ %85 = icmp ne i32 %84, 0
+ br i1 %85, label %ENDIF, label %ENDIF16
+
+ENDIF16: ; preds = %LOOP
+ %86 = fcmp une float %2, %temp4.0
+ %87 = select i1 %86, float 1.000000e+00, float 0.000000e+00
+ %88 = fsub float -0.000000e+00, %87
+ %89 = fptosi float %88 to i32
+ %90 = bitcast i32 %89 to float
+ %91 = bitcast float %90 to i32
+ %92 = icmp ne i32 %91, 0
+ br i1 %92, label %ENDIF, label %ENDIF19
+
+ENDIF19: ; preds = %ENDIF16
+ %93 = fadd float %temp.1, 1.000000e+00
+ %94 = fadd float %temp1.1, 0.000000e+00
+ %95 = fadd float %temp2.1, 0.000000e+00
+ %96 = fadd float %temp3.1, 0.000000e+00
+ %97 = fadd float %temp4.0, 1.000000e+00
+ br label %LOOP
+}
+
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+
+attributes #0 = { "ShaderType"="1" }
diff --git a/test/CodeGen/AMDGPU/scratch-buffer.ll b/test/CodeGen/AMDGPU/scratch-buffer.ll
new file mode 100644
index 000000000000..56088718ada8
--- /dev/null
+++ b/test/CodeGen/AMDGPU/scratch-buffer.ll
@@ -0,0 +1,87 @@
+; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck %s
+
+; When a frame index offset is more than 12 bits, make sure we don't store
+; it in mubuf's offset field.
+
+; Also, make sure we use the same register for storing the scratch buffer address
+; for both stores. This register is allocated by the register scavenger, so we
+; should be able to reuse the same register for each scratch buffer access.
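+;
+; As a quick size check (an illustrative aside, not part of the original
+; test): each alloca below is [8192 x i32] = 8192 * 4 = 32768 = 0x8000 bytes,
+; so the second scratch object starts 0x8000 bytes in. That is well past the
+; 12-bit immediate limit (0xfff = 4095), which is why the checks expect the
+; offset to be materialized into a VGPR and used with "offen" rather than
+; folded into the offset field.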
+
+; CHECK-LABEL: {{^}}legal_offset_fi:
+; CHECK: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0{{$}}
+; CHECK: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen
+; CHECK: v_mov_b32_e32 [[OFFSET]], 0x8000
+; CHECK: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen{{$}}
+
+define void @legal_offset_fi(i32 addrspace(1)* %out, i32 %cond, i32 %if_offset, i32 %else_offset) {
+entry:
+ %scratch0 = alloca [8192 x i32]
+ %scratch1 = alloca [8192 x i32]
+
+ %scratchptr0 = getelementptr [8192 x i32], [8192 x i32]* %scratch0, i32 0, i32 0
+ store i32 1, i32* %scratchptr0
+
+ %scratchptr1 = getelementptr [8192 x i32], [8192 x i32]* %scratch1, i32 0, i32 0
+ store i32 2, i32* %scratchptr1
+
+ %cmp = icmp eq i32 %cond, 0
+ br i1 %cmp, label %if, label %else
+
+if:
+ %if_ptr = getelementptr [8192 x i32], [8192 x i32]* %scratch0, i32 0, i32 %if_offset
+ %if_value = load i32, i32* %if_ptr
+ br label %done
+
+else:
+ %else_ptr = getelementptr [8192 x i32], [8192 x i32]* %scratch1, i32 0, i32 %else_offset
+ %else_value = load i32, i32* %else_ptr
+ br label %done
+
+done:
+ %value = phi i32 [%if_value, %if], [%else_value, %else]
+ store i32 %value, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}legal_offset_fi_offset
+; CHECK: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen
+; CHECK: v_add_i32_e32 [[OFFSET:v[0-9]+]], 0x8000
+; CHECK: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen{{$}}
+
+define void @legal_offset_fi_offset(i32 addrspace(1)* %out, i32 %cond, i32 addrspace(1)* %offsets, i32 %if_offset, i32 %else_offset) {
+entry:
+ %scratch0 = alloca [8192 x i32]
+ %scratch1 = alloca [8192 x i32]
+
+ %offset0 = load i32, i32 addrspace(1)* %offsets
+ %scratchptr0 = getelementptr [8192 x i32], [8192 x i32]* %scratch0, i32 0, i32 %offset0
+ store i32 %offset0, i32* %scratchptr0
+
+ %offsetptr1 = getelementptr i32, i32 addrspace(1)* %offsets, i32 1
+ %offset1 = load i32, i32 addrspace(1)* %offsetptr1
+ %scratchptr1 = getelementptr [8192 x i32], [8192 x i32]* %scratch1, i32 0, i32 %offset1
+ store i32 %offset1, i32* %scratchptr1
+
+ %cmp = icmp eq i32 %cond, 0
+ br i1 %cmp, label %if, label %else
+
+if:
+ %if_ptr = getelementptr [8192 x i32], [8192 x i32]* %scratch0, i32 0, i32 %if_offset
+ %if_value = load i32, i32* %if_ptr
+ br label %done
+
+else:
+ %else_ptr = getelementptr [8192 x i32], [8192 x i32]* %scratch1, i32 0, i32 %else_offset
+ %else_value = load i32, i32* %else_ptr
+ br label %done
+
+done:
+ %value = phi i32 [%if_value, %if], [%else_value, %else]
+ store i32 %value, i32 addrspace(1)* %out
+ ret void
+}
+
diff --git a/test/CodeGen/AMDGPU/sdiv.ll b/test/CodeGen/AMDGPU/sdiv.ll
new file mode 100644
index 000000000000..de645353a401
--- /dev/null
+++ b/test/CodeGen/AMDGPU/sdiv.ll
@@ -0,0 +1,104 @@
+; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+; The code generated by sdiv is long and complex and may frequently change.
+; The goal of this test is to make sure the ISel doesn't fail.
+;
+; This program was previously failing to compile when one of the selectcc
+; opcodes generated by the sdiv lowering was being legalized and optimized to:
+; selectcc Remainder -1, 0, -1, SETGT
+; This was fixed by adding an additional pattern in R600Instructions.td to
+; match this pattern with a CNDGE_INT.
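+;
+; For reference (relying on the usual reading of CNDGE_INT as
+; "src0 >= 0 ? src1 : src2"):
+;   selectcc Remainder, -1, 0, -1, SETGT   is   Remainder > -1 ? 0 : -1
+; which, for signed values, is the same as Remainder >= 0 ? 0 : -1, i.e.
+; exactly one CNDGE_INT.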
+
+; FUNC-LABEL: {{^}}sdiv_i32:
+; EG: CF_END
+define void @sdiv_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+ %num = load i32, i32 addrspace(1) * %in
+ %den = load i32, i32 addrspace(1) * %den_ptr
+ %result = sdiv i32 %num, %den
+ store i32 %result, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sdiv_i32_4:
+define void @sdiv_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %num = load i32, i32 addrspace(1) * %in
+ %result = sdiv i32 %num, 4
+ store i32 %result, i32 addrspace(1)* %out
+ ret void
+}
+
+; Multiply by a weird constant to make sure setIntDivIsCheap is
+; working.
+
+; FUNC-LABEL: {{^}}slow_sdiv_i32_3435:
+; SI: buffer_load_dword [[VAL:v[0-9]+]],
+; SI: v_mov_b32_e32 [[MAGIC:v[0-9]+]], 0x98a1930b
+; SI: v_mul_hi_i32 [[TMP:v[0-9]+]], [[MAGIC]], [[VAL]]
+; SI: v_add_i32
+; SI: v_lshrrev_b32
+; SI: v_ashrrev_i32
+; SI: v_add_i32
+; SI: buffer_store_dword
+; SI: s_endpgm
+define void @slow_sdiv_i32_3435(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %num = load i32, i32 addrspace(1) * %in
+ %result = sdiv i32 %num, 3435
+ store i32 %result, i32 addrspace(1)* %out
+ ret void
+}
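+
+; A standalone sketch of the multiply-by-magic sequence described by the
+; checks above (an illustration only; the shift count of 11 is inferred from
+; the usual round-up multiplier for 3435 and is not pinned down by the
+; checks). In Python:
+;   M = 0x98a1930b - (1 << 32)              # the magic constant read as a signed i32
+;   def sdiv3435(n):
+;       t = ((M * n) >> 32) + n             # v_mul_hi_i32, then v_add_i32
+;       return (t >> 11) + ((t >> 31) & 1)  # v_ashrrev_i32 / v_lshrrev_b32 / v_add_i32
+;   assert all(sdiv3435(n) == int(n / 3435)
+;              for n in (-100000, -3436, -3435, -1, 0, 1, 3434, 3435, 100000))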
+
+define void @sdiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+ %den_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
+ %num = load <2 x i32>, <2 x i32> addrspace(1) * %in
+ %den = load <2 x i32>, <2 x i32> addrspace(1) * %den_ptr
+ %result = sdiv <2 x i32> %num, %den
+ store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+define void @sdiv_v2i32_4(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+ %num = load <2 x i32>, <2 x i32> addrspace(1) * %in
+ %result = sdiv <2 x i32> %num, <i32 4, i32 4>
+ store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+define void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+ %den_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
+ %num = load <4 x i32>, <4 x i32> addrspace(1) * %in
+ %den = load <4 x i32>, <4 x i32> addrspace(1) * %den_ptr
+ %result = sdiv <4 x i32> %num, %den
+ store <4 x i32> %result, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+define void @sdiv_v4i32_4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+ %num = load <4 x i32>, <4 x i32> addrspace(1) * %in
+ %result = sdiv <4 x i32> %num, <i32 4, i32 4, i32 4, i32 4>
+ store <4 x i32> %result, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+; Tests for 64-bit divide bypass.
+; define void @test_get_quotient(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+; %result = sdiv i64 %a, %b
+; store i64 %result, i64 addrspace(1)* %out, align 8
+; ret void
+; }
+
+; define void @test_get_remainder(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+; %result = srem i64 %a, %b
+; store i64 %result, i64 addrspace(1)* %out, align 8
+; ret void
+; }
+
+; define void @test_get_quotient_and_remainder(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+; %resultdiv = sdiv i64 %a, %b
+; %resultrem = srem i64 %a, %b
+; %result = add i64 %resultdiv, %resultrem
+; store i64 %result, i64 addrspace(1)* %out, align 8
+; ret void
+; }
diff --git a/test/CodeGen/AMDGPU/sdivrem24.ll b/test/CodeGen/AMDGPU/sdivrem24.ll
new file mode 100644
index 000000000000..ad5df39f5505
--- /dev/null
+++ b/test/CodeGen/AMDGPU/sdivrem24.ll
@@ -0,0 +1,239 @@
+; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}sdiv24_i8:
+; SI: v_cvt_f32_i32
+; SI: v_cvt_f32_i32
+; SI: v_rcp_f32
+; SI: v_cvt_i32_f32
+
+; EG: INT_TO_FLT
+; EG-DAG: INT_TO_FLT
+; EG-DAG: RECIP_IEEE
+; EG: FLT_TO_INT
+define void @sdiv24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
+ %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1
+ %num = load i8, i8 addrspace(1) * %in
+ %den = load i8, i8 addrspace(1) * %den_ptr
+ %result = sdiv i8 %num, %den
+ store i8 %result, i8 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sdiv24_i16:
+; SI: v_cvt_f32_i32
+; SI: v_cvt_f32_i32
+; SI: v_rcp_f32
+; SI: v_cvt_i32_f32
+
+; EG: INT_TO_FLT
+; EG-DAG: INT_TO_FLT
+; EG-DAG: RECIP_IEEE
+; EG: FLT_TO_INT
+define void @sdiv24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
+ %den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1
+ %num = load i16, i16 addrspace(1) * %in, align 2
+ %den = load i16, i16 addrspace(1) * %den_ptr, align 2
+ %result = sdiv i16 %num, %den
+ store i16 %result, i16 addrspace(1)* %out, align 2
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sdiv24_i32:
+; SI: v_cvt_f32_i32
+; SI: v_cvt_f32_i32
+; SI: v_rcp_f32
+; SI: v_cvt_i32_f32
+
+; EG: INT_TO_FLT
+; EG-DAG: INT_TO_FLT
+; EG-DAG: RECIP_IEEE
+; EG: FLT_TO_INT
+define void @sdiv24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+ %num = load i32, i32 addrspace(1) * %in, align 4
+ %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+ %num.i24.0 = shl i32 %num, 8
+ %den.i24.0 = shl i32 %den, 8
+ %num.i24 = ashr i32 %num.i24.0, 8
+ %den.i24 = ashr i32 %den.i24.0, 8
+ %result = sdiv i32 %num.i24, %den.i24
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sdiv25_i32:
+; SI-NOT: v_cvt_f32_i32
+; SI-NOT: v_rcp_f32
+
+; EG-NOT: INT_TO_FLT
+; EG-NOT: RECIP_IEEE
+define void @sdiv25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+ %num = load i32, i32 addrspace(1) * %in, align 4
+ %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+ %num.i24.0 = shl i32 %num, 7
+ %den.i24.0 = shl i32 %den, 7
+ %num.i24 = ashr i32 %num.i24.0, 7
+ %den.i24 = ashr i32 %den.i24.0, 7
+ %result = sdiv i32 %num.i24, %den.i24
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
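+
+; Aside on the shl/ashr pairs used in these tests: on an i32, (x << 8) >> 8
+; (arithmetic shift) sign-extends the low 24 bits, while (x << 7) >> 7 keeps
+; 25 bits. A small Python model of the 24-bit case:
+;   def sext(x, bits):
+;       x &= (1 << bits) - 1
+;       return x - (1 << bits) if x & (1 << (bits - 1)) else x
+;   assert sext(0x00ffffff, 24) == -1 and sext(0x007fffff, 24) == 0x7fffff
+; Operands known to fit in 24 signed bits can go through the f32 path (a
+; float carries a 24-bit significand), which is why the 24-bit tests expect
+; the cvt/rcp sequence and the 25-bit variants expect it to be absent.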
+
+; FUNC-LABEL: {{^}}test_no_sdiv24_i32_1:
+; SI-NOT: v_cvt_f32_i32
+; SI-NOT: v_rcp_f32
+
+; EG-NOT: INT_TO_FLT
+; EG-NOT: RECIP_IEEE
+define void @test_no_sdiv24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+ %num = load i32, i32 addrspace(1) * %in, align 4
+ %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+ %num.i24.0 = shl i32 %num, 8
+ %den.i24.0 = shl i32 %den, 7
+ %num.i24 = ashr i32 %num.i24.0, 8
+ %den.i24 = ashr i32 %den.i24.0, 7
+ %result = sdiv i32 %num.i24, %den.i24
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_no_sdiv24_i32_2:
+; SI-NOT: v_cvt_f32_i32
+; SI-NOT: v_rcp_f32
+
+; EG-NOT: INT_TO_FLT
+; EG-NOT: RECIP_IEEE
+define void @test_no_sdiv24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+ %num = load i32, i32 addrspace(1) * %in, align 4
+ %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+ %num.i24.0 = shl i32 %num, 7
+ %den.i24.0 = shl i32 %den, 8
+ %num.i24 = ashr i32 %num.i24.0, 7
+ %den.i24 = ashr i32 %den.i24.0, 8
+ %result = sdiv i32 %num.i24, %den.i24
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}srem24_i8:
+; SI: v_cvt_f32_i32
+; SI: v_cvt_f32_i32
+; SI: v_rcp_f32
+; SI: v_cvt_i32_f32
+
+; EG: INT_TO_FLT
+; EG-DAG: INT_TO_FLT
+; EG-DAG: RECIP_IEEE
+; EG: FLT_TO_INT
+define void @srem24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
+ %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1
+ %num = load i8, i8 addrspace(1) * %in
+ %den = load i8, i8 addrspace(1) * %den_ptr
+ %result = srem i8 %num, %den
+ store i8 %result, i8 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}srem24_i16:
+; SI: v_cvt_f32_i32
+; SI: v_cvt_f32_i32
+; SI: v_rcp_f32
+; SI: v_cvt_i32_f32
+
+; EG: INT_TO_FLT
+; EG-DAG: INT_TO_FLT
+; EG-DAG: RECIP_IEEE
+; EG: FLT_TO_INT
+define void @srem24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
+ %den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1
+ %num = load i16, i16 addrspace(1) * %in, align 2
+ %den = load i16, i16 addrspace(1) * %den_ptr, align 2
+ %result = srem i16 %num, %den
+ store i16 %result, i16 addrspace(1)* %out, align 2
+ ret void
+}
+
+; FUNC-LABEL: {{^}}srem24_i32:
+; SI: v_cvt_f32_i32
+; SI: v_cvt_f32_i32
+; SI: v_rcp_f32
+; SI: v_cvt_i32_f32
+
+; EG: INT_TO_FLT
+; EG-DAG: INT_TO_FLT
+; EG-DAG: RECIP_IEEE
+; EG: FLT_TO_INT
+define void @srem24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+ %num = load i32, i32 addrspace(1) * %in, align 4
+ %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+ %num.i24.0 = shl i32 %num, 8
+ %den.i24.0 = shl i32 %den, 8
+ %num.i24 = ashr i32 %num.i24.0, 8
+ %den.i24 = ashr i32 %den.i24.0, 8
+ %result = srem i32 %num.i24, %den.i24
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}srem25_i32:
+; SI-NOT: v_cvt_f32_i32
+; SI-NOT: v_rcp_f32
+
+; EG-NOT: INT_TO_FLT
+; EG-NOT: RECIP_IEEE
+define void @srem25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+ %num = load i32, i32 addrspace(1) * %in, align 4
+ %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+ %num.i24.0 = shl i32 %num, 7
+ %den.i24.0 = shl i32 %den, 7
+ %num.i24 = ashr i32 %num.i24.0, 7
+ %den.i24 = ashr i32 %den.i24.0, 7
+ %result = srem i32 %num.i24, %den.i24
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_no_srem24_i32_1:
+; SI-NOT: v_cvt_f32_i32
+; SI-NOT: v_rcp_f32
+
+; EG-NOT: INT_TO_FLT
+; EG-NOT: RECIP_IEEE
+define void @test_no_srem24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+ %num = load i32, i32 addrspace(1) * %in, align 4
+ %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+ %num.i24.0 = shl i32 %num, 8
+ %den.i24.0 = shl i32 %den, 7
+ %num.i24 = ashr i32 %num.i24.0, 8
+ %den.i24 = ashr i32 %den.i24.0, 7
+ %result = srem i32 %num.i24, %den.i24
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_no_srem24_i32_2:
+; SI-NOT: v_cvt_f32_i32
+; SI-NOT: v_rcp_f32
+
+; EG-NOT: INT_TO_FLT
+; EG-NOT: RECIP_IEEE
+define void @test_no_srem24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+ %num = load i32, i32 addrspace(1) * %in, align 4
+ %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+ %num.i24.0 = shl i32 %num, 7
+ %den.i24.0 = shl i32 %den, 8
+ %num.i24 = ashr i32 %num.i24.0, 7
+ %den.i24 = ashr i32 %den.i24.0, 8
+ %result = srem i32 %num.i24, %den.i24
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/sdivrem64.ll b/test/CodeGen/AMDGPU/sdivrem64.ll
new file mode 100644
index 000000000000..a9b2b7f9df55
--- /dev/null
+++ b/test/CodeGen/AMDGPU/sdivrem64.ll
@@ -0,0 +1,225 @@
+;RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=GCN --check-prefix=FUNC %s
+;RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=VI --check-prefix=GCN --check-prefix=FUNC %s
+;RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=EG --check-prefix=FUNC %s
+
+;FUNC-LABEL: {{^}}test_sdiv:
+;EG: RECIP_UINT
+;EG: LSHL {{.*}}, 1,
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN: v_bfe_u32
+;GCN-NOT: v_mad_f32
+;SI-NOT: v_lshr_b64
+;VI-NOT: v_lshrrev_b64
+;GCN: s_endpgm
+define void @test_sdiv(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+ %result = sdiv i64 %x, %y
+ store i64 %result, i64 addrspace(1)* %out
+ ret void
+}
+
+;FUNC-LABEL: {{^}}test_srem:
+;EG: RECIP_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: AND_INT {{.*}}, 1,
+
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN-NOT: v_mad_f32
+;SI-NOT: v_lshr_b64
+;VI-NOT: v_lshrrev_b64
+;GCN: s_endpgm
+define void @test_srem(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+ %result = urem i64 %x, %y
+ store i64 %result, i64 addrspace(1)* %out
+ ret void
+}
+
+;FUNC-LABEL: {{^}}test_sdiv3264:
+;EG: RECIP_UINT
+;EG-NOT: BFE_UINT
+
+;GCN-NOT: s_bfe_u32
+;GCN-NOT: v_mad_f32
+;SI-NOT: v_lshr_b64
+;VI-NOT: v_lshrrev_b64
+;GCN: s_endpgm
+define void @test_sdiv3264(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+ %1 = ashr i64 %x, 33
+ %2 = ashr i64 %y, 33
+ %result = sdiv i64 %1, %2
+ store i64 %result, i64 addrspace(1)* %out
+ ret void
+}
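+
+; Naming note (an inference from the shift amounts): after "ashr i64 %x, 33"
+; at most 64 - 33 = 31 significant bits remain, so a plain 32-bit expansion
+; is enough and none of the 64-bit bit-by-bit sequence should appear. The
+; 2464 variants further down shift by 40, leaving 24 bits, which is narrow
+; enough for the f32 path (hence the v_mad_f32 expectation there).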
+
+;FUNC-LABEL: {{^}}test_srem3264:
+;EG: RECIP_UINT
+;EG-NOT: BFE_UINT
+
+;GCN-NOT: s_bfe_u32
+;GCN-NOT: v_mad_f32
+;SI-NOT: v_lshr_b64
+;VI-NOT: v_lshrrev_b64
+;GCN: s_endpgm
+define void @test_srem3264(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+ %1 = ashr i64 %x, 33
+ %2 = ashr i64 %y, 33
+ %result = srem i64 %1, %2
+ store i64 %result, i64 addrspace(1)* %out
+ ret void
+}
+
+;FUNC-LABEL: {{^}}test_sdiv2464:
+;EG: INT_TO_FLT
+;EG: INT_TO_FLT
+;EG: FLT_TO_INT
+;EG-NOT: RECIP_UINT
+;EG-NOT: BFE_UINT
+
+;GCN-NOT: s_bfe_u32
+;GCN: v_mad_f32
+;SI-NOT: v_lshr_b64
+;VI-NOT: v_lshrrev_b64
+;GCN: s_endpgm
+define void @test_sdiv2464(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+ %1 = ashr i64 %x, 40
+ %2 = ashr i64 %y, 40
+ %result = sdiv i64 %1, %2
+ store i64 %result, i64 addrspace(1)* %out
+ ret void
+}
+
+;FUNC-LABEL: {{^}}test_srem2464:
+;EG: INT_TO_FLT
+;EG: INT_TO_FLT
+;EG: FLT_TO_INT
+;EG-NOT: RECIP_UINT
+;EG-NOT: BFE_UINT
+
+;GCN-NOT: s_bfe_u32
+;GCN: v_mad_f32
+;SI-NOT: v_lshr_b64
+;VI-NOT: v_lshrrev_b64
+;GCN: s_endpgm
+define void @test_srem2464(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+ %1 = ashr i64 %x, 40
+ %2 = ashr i64 %y, 40
+ %result = srem i64 %1, %2
+ store i64 %result, i64 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/select-i1.ll b/test/CodeGen/AMDGPU/select-i1.ll
new file mode 100644
index 000000000000..6735394e93a9
--- /dev/null
+++ b/test/CodeGen/AMDGPU/select-i1.ll
@@ -0,0 +1,15 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+; FIXME: This should go in the existing select.ll test, except the current testcase there is broken on SI
+
+; FUNC-LABEL: {{^}}select_i1:
+; SI: v_cndmask_b32
+; SI-NOT: v_cndmask_b32
+define void @select_i1(i1 addrspace(1)* %out, i32 %cond, i1 %a, i1 %b) nounwind {
+ %cmp = icmp ugt i32 %cond, 5
+ %sel = select i1 %cmp, i1 %a, i1 %b
+ store i1 %sel, i1 addrspace(1)* %out, align 4
+ ret void
+}
+
diff --git a/test/CodeGen/AMDGPU/select-vectors.ll b/test/CodeGen/AMDGPU/select-vectors.ll
new file mode 100644
index 000000000000..59082c65cc8a
--- /dev/null
+++ b/test/CodeGen/AMDGPU/select-vectors.ll
@@ -0,0 +1,156 @@
+; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+; Test expansion of scalar selects on vectors.
+; Evergreen is not enabled since it seems to have problems with doubles.
+
+
+; FUNC-LABEL: {{^}}select_v4i8:
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+define void @select_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x i8> %b, i8 %c) nounwind {
+ %cmp = icmp eq i8 %c, 0
+ %select = select i1 %cmp, <4 x i8> %a, <4 x i8> %b
+ store <4 x i8> %select, <4 x i8> addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}select_v4i16:
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+define void @select_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b, i32 %c) nounwind {
+ %cmp = icmp eq i32 %c, 0
+ %select = select i1 %cmp, <4 x i16> %a, <4 x i16> %b
+ store <4 x i16> %select, <4 x i16> addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}select_v2i32:
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: buffer_store_dwordx2
+define void @select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b, i32 %c) nounwind {
+ %cmp = icmp eq i32 %c, 0
+ %select = select i1 %cmp, <2 x i32> %a, <2 x i32> %b
+ store <2 x i32> %select, <2 x i32> addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}select_v4i32:
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: buffer_store_dwordx4
+define void @select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b, i32 %c) nounwind {
+ %cmp = icmp eq i32 %c, 0
+ %select = select i1 %cmp, <4 x i32> %a, <4 x i32> %b
+ store <4 x i32> %select, <4 x i32> addrspace(1)* %out, align 16
+ ret void
+}
+
+; FUNC-LABEL: {{^}}select_v8i32:
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+define void @select_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b, i32 %c) nounwind {
+ %cmp = icmp eq i32 %c, 0
+ %select = select i1 %cmp, <8 x i32> %a, <8 x i32> %b
+ store <8 x i32> %select, <8 x i32> addrspace(1)* %out, align 16
+ ret void
+}
+
+; FUNC-LABEL: {{^}}select_v2f32:
+; SI: buffer_store_dwordx2
+define void @select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b, i32 %c) nounwind {
+ %cmp = icmp eq i32 %c, 0
+ %select = select i1 %cmp, <2 x float> %a, <2 x float> %b
+ store <2 x float> %select, <2 x float> addrspace(1)* %out, align 16
+ ret void
+}
+
+; FUNC-LABEL: {{^}}select_v4f32:
+; SI: buffer_store_dwordx4
+define void @select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b, i32 %c) nounwind {
+ %cmp = icmp eq i32 %c, 0
+ %select = select i1 %cmp, <4 x float> %a, <4 x float> %b
+ store <4 x float> %select, <4 x float> addrspace(1)* %out, align 16
+ ret void
+}
+
+; FUNC-LABEL: {{^}}select_v8f32:
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+define void @select_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b, i32 %c) nounwind {
+ %cmp = icmp eq i32 %c, 0
+ %select = select i1 %cmp, <8 x float> %a, <8 x float> %b
+ store <8 x float> %select, <8 x float> addrspace(1)* %out, align 16
+ ret void
+}
+
+; FUNC-LABEL: {{^}}select_v2f64:
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+define void @select_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b, i32 %c) nounwind {
+ %cmp = icmp eq i32 %c, 0
+ %select = select i1 %cmp, <2 x double> %a, <2 x double> %b
+ store <2 x double> %select, <2 x double> addrspace(1)* %out, align 16
+ ret void
+}
+
+; FUNC-LABEL: {{^}}select_v4f64:
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+define void @select_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b, i32 %c) nounwind {
+ %cmp = icmp eq i32 %c, 0
+ %select = select i1 %cmp, <4 x double> %a, <4 x double> %b
+ store <4 x double> %select, <4 x double> addrspace(1)* %out, align 16
+ ret void
+}
+
+; FUNC-LABEL: {{^}}select_v8f64:
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+define void @select_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b, i32 %c) nounwind {
+ %cmp = icmp eq i32 %c, 0
+ %select = select i1 %cmp, <8 x double> %a, <8 x double> %b
+ store <8 x double> %select, <8 x double> addrspace(1)* %out, align 16
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/select.ll b/test/CodeGen/AMDGPU/select.ll
new file mode 100644
index 000000000000..45f3cd5a7ac5
--- /dev/null
+++ b/test/CodeGen/AMDGPU/select.ll
@@ -0,0 +1,47 @@
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+
+; Normally icmp + select is optimized to select_cc; when this happens, the
+; DAGLegalizer never sees the select and doesn't have a chance to legalize it.
+;
+; In order to avoid the select_cc optimization, this test case calculates the
+; condition for the select in a separate basic block.
+
+; FUNC-LABEL: {{^}}select:
+; EG-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.X
+; EG-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.X
+; EG-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY
+; EG-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY
+; EG-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XYZW
+; EG-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XYZW
+define void @select (i32 addrspace(1)* %i32out, float addrspace(1)* %f32out,
+ <2 x i32> addrspace(1)* %v2i32out, <2 x float> addrspace(1)* %v2f32out,
+ <4 x i32> addrspace(1)* %v4i32out, <4 x float> addrspace(1)* %v4f32out,
+ i32 %cond) {
+entry:
+ br label %for
+body:
+ %inc = add i32 %i, 1
+ %br_cmp.i = icmp eq i1 %br_cmp, 0
+ br label %for
+for:
+ %i = phi i32 [ %inc, %body], [ 0, %entry ]
+ %br_cmp = phi i1 [ %br_cmp.i, %body ], [ 0, %entry ]
+ %0 = icmp eq i32 %cond, %i
+ %1 = select i1 %br_cmp, i32 2, i32 3
+ %2 = select i1 %br_cmp, float 2.0 , float 5.0
+ %3 = select i1 %br_cmp, <2 x i32> <i32 2, i32 3>, <2 x i32> <i32 4, i32 5>
+ %4 = select i1 %br_cmp, <2 x float> <float 2.0, float 3.0>, <2 x float> <float 4.0, float 5.0>
+ %5 = select i1 %br_cmp, <4 x i32> <i32 2 , i32 3, i32 4, i32 5>, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+ %6 = select i1 %br_cmp, <4 x float> <float 2.0, float 3.0, float 4.0, float 5.0>, <4 x float> <float 6.0, float 7.0, float 8.0, float 9.0>
+ br i1 %0, label %body, label %done
+
+done:
+ store i32 %1, i32 addrspace(1)* %i32out
+ store float %2, float addrspace(1)* %f32out
+ store <2 x i32> %3, <2 x i32> addrspace(1)* %v2i32out
+ store <2 x float> %4, <2 x float> addrspace(1)* %v2f32out
+ store <4 x i32> %5, <4 x i32> addrspace(1)* %v4i32out
+ store <4 x float> %6, <4 x float> addrspace(1)* %v4f32out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/select64.ll b/test/CodeGen/AMDGPU/select64.ll
new file mode 100644
index 000000000000..5cebb30dc72e
--- /dev/null
+++ b/test/CodeGen/AMDGPU/select64.ll
@@ -0,0 +1,68 @@
+; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
+
+; CHECK-LABEL: {{^}}select0:
+; i64 select should be split into two i32 selects, and we shouldn't need
+; to use a shift to extract the hi dword of the input.
+; CHECK-NOT: s_lshr_b64
+; CHECK: v_cndmask
+; CHECK: v_cndmask
+define void @select0(i64 addrspace(1)* %out, i32 %cond, i64 %in) {
+entry:
+ %0 = icmp ugt i32 %cond, 5
+ %1 = select i1 %0, i64 0, i64 %in
+ store i64 %1, i64 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}select_trunc_i64:
+; CHECK: v_cndmask_b32
+; CHECK-NOT: v_cndmask_b32
+define void @select_trunc_i64(i32 addrspace(1)* %out, i32 %cond, i64 %in) nounwind {
+ %cmp = icmp ugt i32 %cond, 5
+ %sel = select i1 %cmp, i64 0, i64 %in
+ %trunc = trunc i64 %sel to i32
+ store i32 %trunc, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; CHECK-LABEL: {{^}}select_trunc_i64_2:
+; CHECK: v_cndmask_b32
+; CHECK-NOT: v_cndmask_b32
+define void @select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond, i64 %a, i64 %b) nounwind {
+ %cmp = icmp ugt i32 %cond, 5
+ %sel = select i1 %cmp, i64 %a, i64 %b
+ %trunc = trunc i64 %sel to i32
+ store i32 %trunc, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; CHECK-LABEL: {{^}}v_select_trunc_i64_2:
+; CHECK: v_cndmask_b32
+; CHECK-NOT: v_cndmask_b32
+define void @v_select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
+ %cmp = icmp ugt i32 %cond, 5
+ %a = load i64, i64 addrspace(1)* %aptr, align 8
+ %b = load i64, i64 addrspace(1)* %bptr, align 8
+ %sel = select i1 %cmp, i64 %a, i64 %b
+ %trunc = trunc i64 %sel to i32
+ store i32 %trunc, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; CHECK-LABEL: {{^}}v_select_i64_split_imm:
+; CHECK: s_mov_b32 [[SHI:s[0-9]+]], 63
+; CHECK: s_mov_b32 [[SLO:s[0-9]+]], 0
+; CHECK-DAG: v_mov_b32_e32 [[VHI:v[0-9]+]], [[SHI]]
+; CHECK-DAG: v_mov_b32_e32 [[VLO:v[0-9]+]], [[SLO]]
+; CHECK-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, [[VLO]], {{v[0-9]+}}
+; CHECK-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, [[VHI]], {{v[0-9]+}}
+; CHECK: s_endpgm
+define void @v_select_i64_split_imm(i64 addrspace(1)* %out, i32 %cond, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
+ %cmp = icmp ugt i32 %cond, 5
+ %a = load i64, i64 addrspace(1)* %aptr, align 8
+ %b = load i64, i64 addrspace(1)* %bptr, align 8
+ %sel = select i1 %cmp, i64 %a, i64 270582939648 ; 63 << 32
+ store i64 %sel, i64 addrspace(1)* %out, align 8
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/selectcc-cnd.ll b/test/CodeGen/AMDGPU/selectcc-cnd.ll
new file mode 100644
index 000000000000..94d0ace75697
--- /dev/null
+++ b/test/CodeGen/AMDGPU/selectcc-cnd.ll
@@ -0,0 +1,12 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+;CHECK-NOT: SETE
+;CHECK: CNDE {{\*?}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1.0, literal.x,
+;CHECK: 1073741824
+define void @test(float addrspace(1)* %out, float addrspace(1)* %in) {
+ %1 = load float, float addrspace(1)* %in
+ %2 = fcmp oeq float %1, 0.0
+ %3 = select i1 %2, float 1.0, float 2.0
+ store float %3, float addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/selectcc-cnde-int.ll b/test/CodeGen/AMDGPU/selectcc-cnde-int.ll
new file mode 100644
index 000000000000..58a4ee7d62b2
--- /dev/null
+++ b/test/CodeGen/AMDGPU/selectcc-cnde-int.ll
@@ -0,0 +1,12 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+;CHECK-NOT: SETE_INT
+;CHECK: CNDE_INT {{\*?}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, literal.x,
+;CHECK-NEXT: 2
+define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %1 = load i32, i32 addrspace(1)* %in
+ %2 = icmp eq i32 %1, 0
+ %3 = select i1 %2, i32 1, i32 2
+ store i32 %3, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/selectcc-icmp-select-float.ll b/test/CodeGen/AMDGPU/selectcc-icmp-select-float.ll
new file mode 100644
index 000000000000..e870ee891e66
--- /dev/null
+++ b/test/CodeGen/AMDGPU/selectcc-icmp-select-float.ll
@@ -0,0 +1,16 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+; Note: additional optimizations may cause this SGT to be replaced with a
+; CND* instruction.
+; CHECK: SETGT_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, literal.x,
+; CHECK-NEXT: -1
+; Test a selectcc with i32 LHS/RHS and float True/False
+
+define void @test(float addrspace(1)* %out, i32 addrspace(1)* %in) {
+entry:
+ %0 = load i32, i32 addrspace(1)* %in
+ %1 = icmp sge i32 %0, 0
+ %2 = select i1 %1, float 1.0, float 0.0
+ store float %2, float addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/selectcc-opt.ll b/test/CodeGen/AMDGPU/selectcc-opt.ll
new file mode 100644
index 000000000000..65be4a626a18
--- /dev/null
+++ b/test/CodeGen/AMDGPU/selectcc-opt.ll
@@ -0,0 +1,80 @@
+; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+
+; FUNC-LABEL: {{^}}test_a:
+; EG-NOT: CND
+; EG: SET{{[NEQGTL]+}}_DX10
+
+define void @test_a(i32 addrspace(1)* %out, float %in) {
+entry:
+ %0 = fcmp olt float %in, 0.000000e+00
+ %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00
+ %2 = fsub float -0.000000e+00, %1
+ %3 = fptosi float %2 to i32
+ %4 = bitcast i32 %3 to float
+ %5 = bitcast float %4 to i32
+ %6 = icmp ne i32 %5, 0
+ br i1 %6, label %IF, label %ENDIF
+
+IF:
+ %7 = getelementptr i32, i32 addrspace(1)* %out, i32 1
+ store i32 0, i32 addrspace(1)* %7
+ br label %ENDIF
+
+ENDIF:
+ store i32 0, i32 addrspace(1)* %out
+ ret void
+}
+
+; Same as test_a, but the branch labels are swapped to produce the inverse cc
+; for the icmp instruction.
+
+; EG-LABEL: {{^}}test_b:
+; EG: SET{{[GTEQN]+}}_DX10
+; EG-NEXT: PRED_
+; EG-NEXT: ALU clause starting
+define void @test_b(i32 addrspace(1)* %out, float %in) {
+entry:
+ %0 = fcmp olt float %in, 0.0
+ %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00
+ %2 = fsub float -0.000000e+00, %1
+ %3 = fptosi float %2 to i32
+ %4 = bitcast i32 %3 to float
+ %5 = bitcast float %4 to i32
+ %6 = icmp ne i32 %5, 0
+ br i1 %6, label %ENDIF, label %IF
+
+IF:
+ %7 = getelementptr i32, i32 addrspace(1)* %out, i32 1
+ store i32 0, i32 addrspace(1)* %7
+ br label %ENDIF
+
+ENDIF:
+ store i32 0, i32 addrspace(1)* %out
+ ret void
+}
+
+; Test a CND*_INT instruction with float true/false values
+; EG-LABEL: {{^}}test_c:
+; EG: CND{{[GTE]+}}_INT
+define void @test_c(float addrspace(1)* %out, i32 %in) {
+entry:
+ %0 = icmp sgt i32 %in, 0
+ %1 = select i1 %0, float 2.0, float 3.0
+ store float %1, float addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}selectcc_bool:
+; SI: v_cmp_ne_i32
+; SI-NEXT: v_cndmask_b32_e64
+; SI-NOT: cmp
+; SI-NOT: cndmask
+define void @selectcc_bool(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+ %icmp0 = icmp ne i32 %a, %b
+ %ext = select i1 %icmp0, i32 -1, i32 0
+ store i32 %ext, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/selectcc.ll b/test/CodeGen/AMDGPU/selectcc.ll
new file mode 100644
index 000000000000..f378e15dd763
--- /dev/null
+++ b/test/CodeGen/AMDGPU/selectcc.ll
@@ -0,0 +1,20 @@
+; RUN: llc -verify-machineinstrs -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}selectcc_i64:
+; EG: XOR_INT
+; EG: XOR_INT
+; EG: OR_INT
+; EG: CNDE_INT
+; EG: CNDE_INT
+; SI: v_cmp_eq_i64
+; SI: v_cndmask
+; SI: v_cndmask
+define void @selectcc_i64(i64 addrspace(1) * %out, i64 %lhs, i64 %rhs, i64 %true, i64 %false) {
+entry:
+ %0 = icmp eq i64 %lhs, %rhs
+ %1 = select i1 %0, i64 %true, i64 %false
+ store i64 %1, i64 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/set-dx10.ll b/test/CodeGen/AMDGPU/set-dx10.ll
new file mode 100644
index 000000000000..53694dcffa66
--- /dev/null
+++ b/test/CodeGen/AMDGPU/set-dx10.ll
@@ -0,0 +1,161 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+; These tests check that floating-point comparisons whose results are used by
+; a select to store integer true (-1) and false (0) values are lowered to one
+; of the SET*DX10 instructions.
+
+; CHECK: {{^}}fcmp_une_select_fptosi:
+; CHECK: SETNE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
+; CHECK-NEXT: LSHR
+; CHECK-NEXT: 1084227584(5.000000e+00)
+define void @fcmp_une_select_fptosi(i32 addrspace(1)* %out, float %in) {
+entry:
+ %0 = fcmp une float %in, 5.0
+ %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00
+ %2 = fsub float -0.000000e+00, %1
+ %3 = fptosi float %2 to i32
+ store i32 %3, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK: {{^}}fcmp_une_select_i32:
+; CHECK: SETNE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
+; CHECK-NEXT: LSHR
+; CHECK-NEXT: 1084227584(5.000000e+00)
+define void @fcmp_une_select_i32(i32 addrspace(1)* %out, float %in) {
+entry:
+ %0 = fcmp une float %in, 5.0
+ %1 = select i1 %0, i32 -1, i32 0
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK: {{^}}fcmp_oeq_select_fptosi:
+; CHECK: SETE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
+; CHECK-NEXT: LSHR
+; CHECK-NEXT: 1084227584(5.000000e+00)
+define void @fcmp_oeq_select_fptosi(i32 addrspace(1)* %out, float %in) {
+entry:
+ %0 = fcmp oeq float %in, 5.0
+ %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00
+ %2 = fsub float -0.000000e+00, %1
+ %3 = fptosi float %2 to i32
+ store i32 %3, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK: {{^}}fcmp_oeq_select_i32:
+; CHECK: SETE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
+; CHECK-NEXT: LSHR
+; CHECK-NEXT: 1084227584(5.000000e+00)
+define void @fcmp_oeq_select_i32(i32 addrspace(1)* %out, float %in) {
+entry:
+ %0 = fcmp oeq float %in, 5.0
+ %1 = select i1 %0, i32 -1, i32 0
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK: {{^}}fcmp_ogt_select_fptosi:
+; CHECK: SETGT_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
+; CHECK-NEXT: LSHR
+; CHECK-NEXT: 1084227584(5.000000e+00)
+define void @fcmp_ogt_select_fptosi(i32 addrspace(1)* %out, float %in) {
+entry:
+ %0 = fcmp ogt float %in, 5.0
+ %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00
+ %2 = fsub float -0.000000e+00, %1
+ %3 = fptosi float %2 to i32
+ store i32 %3, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK: {{^}}fcmp_ogt_select_i32:
+; CHECK: SETGT_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
+; CHECK-NEXT: LSHR
+; CHECK-NEXT: 1084227584(5.000000e+00)
+define void @fcmp_ogt_select_i32(i32 addrspace(1)* %out, float %in) {
+entry:
+ %0 = fcmp ogt float %in, 5.0
+ %1 = select i1 %0, i32 -1, i32 0
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK: {{^}}fcmp_oge_select_fptosi:
+; CHECK: SETGE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
+; CHECK-NEXT: LSHR
+; CHECK-NEXT: 1084227584(5.000000e+00)
+define void @fcmp_oge_select_fptosi(i32 addrspace(1)* %out, float %in) {
+entry:
+ %0 = fcmp oge float %in, 5.0
+ %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00
+ %2 = fsub float -0.000000e+00, %1
+ %3 = fptosi float %2 to i32
+ store i32 %3, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK: {{^}}fcmp_oge_select_i32:
+; CHECK: SETGE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
+; CHECK-NEXT: LSHR
+; CHECK-NEXT: 1084227584(5.000000e+00)
+define void @fcmp_oge_select_i32(i32 addrspace(1)* %out, float %in) {
+entry:
+ %0 = fcmp oge float %in, 5.0
+ %1 = select i1 %0, i32 -1, i32 0
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK: {{^}}fcmp_ole_select_fptosi:
+; CHECK: SETGE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z,
+; CHECK-NEXT: LSHR
+; CHECK-NEXT: 1084227584(5.000000e+00)
+define void @fcmp_ole_select_fptosi(i32 addrspace(1)* %out, float %in) {
+entry:
+ %0 = fcmp ole float %in, 5.0
+ %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00
+ %2 = fsub float -0.000000e+00, %1
+ %3 = fptosi float %2 to i32
+ store i32 %3, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK: {{^}}fcmp_ole_select_i32:
+; CHECK: SETGE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z,
+; CHECK-NEXT: LSHR
+; CHECK-NEXT: 1084227584(5.000000e+00)
+define void @fcmp_ole_select_i32(i32 addrspace(1)* %out, float %in) {
+entry:
+ %0 = fcmp ole float %in, 5.0
+ %1 = select i1 %0, i32 -1, i32 0
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK: {{^}}fcmp_olt_select_fptosi:
+; CHECK: SETGT_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z,
+; CHECK-NEXT: LSHR
+; CHECK-NEXT: 1084227584(5.000000e+00)
+define void @fcmp_olt_select_fptosi(i32 addrspace(1)* %out, float %in) {
+entry:
+ %0 = fcmp olt float %in, 5.0
+ %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00
+ %2 = fsub float -0.000000e+00, %1
+ %3 = fptosi float %2 to i32
+ store i32 %3, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK: {{^}}fcmp_olt_select_i32:
+; CHECK: SETGT_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z,
+; CHECK-NEXT: LSHR
+; CHECK-NEXT: 1084227584(5.000000e+00)
+define void @fcmp_olt_select_i32(i32 addrspace(1)* %out, float %in) {
+entry:
+ %0 = fcmp olt float %in, 5.0
+ %1 = select i1 %0, i32 -1, i32 0
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/setcc-equivalent.ll b/test/CodeGen/AMDGPU/setcc-equivalent.ll
new file mode 100644
index 000000000000..11ea793650c4
--- /dev/null
+++ b/test/CodeGen/AMDGPU/setcc-equivalent.ll
@@ -0,0 +1,30 @@
+; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG %s
+
+; EG-LABEL: {{^}}and_setcc_setcc_i32:
+; EG: AND_INT
+; EG-NEXT: SETE_INT
+define void @and_setcc_setcc_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+ %cmp1 = icmp eq i32 %a, -1
+ %cmp2 = icmp eq i32 %b, -1
+ %and = and i1 %cmp1, %cmp2
+ %ext = sext i1 %and to i32
+ store i32 %ext, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; EG-LABEL: {{^}}and_setcc_setcc_v4i32:
+; EG: AND_INT
+; EG: AND_INT
+; EG: SETE_INT
+; EG: AND_INT
+; EG: SETE_INT
+; EG: AND_INT
+; EG: SETE_INT
+define void @and_setcc_setcc_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) {
+ %cmp1 = icmp eq <4 x i32> %a, <i32 -1, i32 -1, i32 -1, i32 -1>
+ %cmp2 = icmp eq <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1>
+ %and = and <4 x i1> %cmp1, %cmp2
+ %ext = sext <4 x i1> %and to <4 x i32>
+ store <4 x i32> %ext, <4 x i32> addrspace(1)* %out, align 4
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/setcc-opt.ll b/test/CodeGen/AMDGPU/setcc-opt.ll
new file mode 100644
index 000000000000..4e6a10d6b78d
--- /dev/null
+++ b/test/CodeGen/AMDGPU/setcc-opt.ll
@@ -0,0 +1,236 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}sext_bool_icmp_eq_0:
+; GCN-NOT: v_cmp
+; GCN: v_cmp_ne_i32_e32 vcc,
+; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
+; GCN-NEXT: buffer_store_byte [[RESULT]]
+; GCN-NEXT: s_endpgm
+
+; EG: SETNE_INT * [[CMP:T[0-9]+]].[[CMPCHAN:[XYZW]]], KC0[2].Z, KC0[2].W
+; EG: AND_INT T{{[0-9]+.[XYZW]}}, PS, 1
+define void @sext_bool_icmp_eq_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+ %icmp0 = icmp eq i32 %a, %b
+ %ext = sext i1 %icmp0 to i32
+ %icmp1 = icmp eq i32 %ext, 0
+ store i1 %icmp1, i1 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sext_bool_icmp_ne_0:
+; GCN-NOT: v_cmp
+; GCN: v_cmp_ne_i32_e32 vcc,
+; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
+; GCN-NEXT: buffer_store_byte [[RESULT]]
+; GCN-NEXT: s_endpgm
+
+; EG: SETNE_INT * [[CMP:T[0-9]+]].[[CMPCHAN:[XYZW]]], KC0[2].Z, KC0[2].W
+; EG: AND_INT T{{[0-9]+.[XYZW]}}, PS, 1
+define void @sext_bool_icmp_ne_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+ %icmp0 = icmp ne i32 %a, %b
+ %ext = sext i1 %icmp0 to i32
+ %icmp1 = icmp ne i32 %ext, 0
+ store i1 %icmp1, i1 addrspace(1)* %out
+ ret void
+}
+
+; This really folds away to false
+; FUNC-LABEL: {{^}}sext_bool_icmp_eq_1:
+; GCN: v_cmp_eq_i32_e32 vcc,
+; GCN-NEXT: v_cndmask_b32_e64 [[TMP:v[0-9]+]], 0, -1, vcc
+; GCN-NEXT: v_cmp_eq_i32_e32 vcc, 1, [[TMP]]{{$}}
+; GCN-NEXT: v_cndmask_b32_e64 [[TMP:v[0-9]+]], 0, 1,
+; GCN-NEXT: buffer_store_byte [[TMP]]
+; GCN-NEXT: s_endpgm
+define void @sext_bool_icmp_eq_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+ %icmp0 = icmp eq i32 %a, %b
+ %ext = sext i1 %icmp0 to i32
+ %icmp1 = icmp eq i32 %ext, 1
+ store i1 %icmp1, i1 addrspace(1)* %out
+ ret void
+}
+
+; This really folds away to true
+; FUNC-LABEL: {{^}}sext_bool_icmp_ne_1:
+; GCN: v_cmp_ne_i32_e32 vcc,
+; GCN-NEXT: v_cndmask_b32_e64 [[TMP:v[0-9]+]], 0, -1, vcc
+; GCN-NEXT: v_cmp_ne_i32_e32 vcc, 1, [[TMP]]{{$}}
+; GCN-NEXT: v_cndmask_b32_e64 [[TMP:v[0-9]+]], 0, 1,
+; GCN-NEXT: buffer_store_byte [[TMP]]
+; GCN-NEXT: s_endpgm
+define void @sext_bool_icmp_ne_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+ %icmp0 = icmp ne i32 %a, %b
+ %ext = sext i1 %icmp0 to i32
+ %icmp1 = icmp ne i32 %ext, 1
+ store i1 %icmp1, i1 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}zext_bool_icmp_eq_0:
+; GCN-NOT: v_cmp
+; GCN: v_cmp_ne_i32_e32 vcc,
+; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
+; GCN-NEXT: buffer_store_byte [[RESULT]]
+; GCN-NEXT: s_endpgm
+define void @zext_bool_icmp_eq_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+ %icmp0 = icmp eq i32 %a, %b
+ %ext = zext i1 %icmp0 to i32
+ %icmp1 = icmp eq i32 %ext, 0
+ store i1 %icmp1, i1 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}zext_bool_icmp_ne_0:
+; GCN-NOT: v_cmp
+; GCN: v_cmp_ne_i32_e32 vcc,
+; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
+; GCN-NEXT: buffer_store_byte [[RESULT]]
+; GCN-NEXT: s_endpgm
+define void @zext_bool_icmp_ne_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+ %icmp0 = icmp ne i32 %a, %b
+ %ext = zext i1 %icmp0 to i32
+ %icmp1 = icmp ne i32 %ext, 0
+ store i1 %icmp1, i1 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}zext_bool_icmp_eq_1:
+; GCN-NOT: v_cmp
+; GCN: v_cmp_eq_i32_e32 vcc,
+; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
+; GCN-NEXT: buffer_store_byte [[RESULT]]
+; GCN-NEXT: s_endpgm
+define void @zext_bool_icmp_eq_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+ %icmp0 = icmp eq i32 %a, %b
+ %ext = zext i1 %icmp0 to i32
+ %icmp1 = icmp eq i32 %ext, 1
+ store i1 %icmp1, i1 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}zext_bool_icmp_ne_1:
+; GCN-NOT: v_cmp
+; GCN: v_cmp_eq_i32_e32 vcc,
+; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
+; GCN-NEXT: buffer_store_byte [[RESULT]]
+define void @zext_bool_icmp_ne_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+ %icmp0 = icmp ne i32 %a, %b
+ %ext = zext i1 %icmp0 to i32
+ %icmp1 = icmp ne i32 %ext, 1
+ store i1 %icmp1, i1 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sext_bool_icmp_ne_k:
+; SI-DAG: s_load_dword [[A:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dword [[B:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
+; VI-DAG: s_load_dword [[A:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; VI-DAG: s_load_dword [[B:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
+; GCN: v_mov_b32_e32 [[VB:v[0-9]+]], [[B]]
+; GCN: v_cmp_ne_i32_e32 vcc, 2, [[VB]]{{$}}
+; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
+; GCN: buffer_store_byte
+; GCN: s_endpgm
+define void @sext_bool_icmp_ne_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+ %icmp0 = icmp ne i32 %a, %b
+ %ext = sext i1 %icmp0 to i32
+ %icmp1 = icmp ne i32 %ext, 2
+ store i1 %icmp1, i1 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}cmp_zext_k_i8max:
+; GCN: buffer_load_ubyte [[B:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44
+; GCN: v_mov_b32_e32 [[K255:v[0-9]+]], 0xff{{$}}
+; GCN: v_cmp_ne_i32_e32 vcc, [[K255]], [[B]]
+; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
+; GCN-NEXT: buffer_store_byte [[RESULT]]
+; GCN: s_endpgm
+define void @cmp_zext_k_i8max(i1 addrspace(1)* %out, i8 %b) nounwind {
+ %b.ext = zext i8 %b to i32
+ %icmp0 = icmp ne i32 %b.ext, 255
+ store i1 %icmp0, i1 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}cmp_sext_k_neg1:
+; GCN: buffer_load_sbyte [[B:v[0-9]+]]
+; GCN: v_cmp_ne_i32_e32 vcc, -1, [[B]]{{$}}
+; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
+; GCN-NEXT: buffer_store_byte [[RESULT]]
+; GCN: s_endpgm
+define void @cmp_sext_k_neg1(i1 addrspace(1)* %out, i8 addrspace(1)* %b.ptr) nounwind {
+ %b = load i8, i8 addrspace(1)* %b.ptr
+ %b.ext = sext i8 %b to i32
+ %icmp0 = icmp ne i32 %b.ext, -1
+ store i1 %icmp0, i1 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}cmp_sext_k_neg1_i8_sext_arg:
+; GCN: s_load_dword [[B:s[0-9]+]]
+; GCN: v_cmp_ne_i32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -1, [[B]]
+; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CMP]]
+; GCN-NEXT: buffer_store_byte [[RESULT]]
+; GCN: s_endpgm
+define void @cmp_sext_k_neg1_i8_sext_arg(i1 addrspace(1)* %out, i8 signext %b) nounwind {
+ %b.ext = sext i8 %b to i32
+ %icmp0 = icmp ne i32 %b.ext, -1
+ store i1 %icmp0, i1 addrspace(1)* %out
+ ret void
+}
+
+; FIXME: This ends up doing a buffer_load_ubyte and a compare against 255.
+; This seems to be because of ordering problems when load widths are not
+; allowed to be reduced. It should do a buffer_load_sbyte and compare with -1.
+
+; FUNC-LABEL: {{^}}cmp_sext_k_neg1_i8_arg:
+; GCN-DAG: buffer_load_ubyte [[B:v[0-9]+]]
+; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0xff{{$}}
+; GCN: v_cmp_ne_i32_e32 vcc, [[K]], [[B]]{{$}}
+; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
+; GCN-NEXT: buffer_store_byte [[RESULT]]
+; GCN: s_endpgm
+define void @cmp_sext_k_neg1_i8_arg(i1 addrspace(1)* %out, i8 %b) nounwind {
+ %b.ext = sext i8 %b to i32
+ %icmp0 = icmp ne i32 %b.ext, -1
+ store i1 %icmp0, i1 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}cmp_zext_k_neg1:
+; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], 1{{$}}
+; GCN: buffer_store_byte [[RESULT]]
+; GCN: s_endpgm
+define void @cmp_zext_k_neg1(i1 addrspace(1)* %out, i8 %b) nounwind {
+ %b.ext = zext i8 %b to i32
+ %icmp0 = icmp ne i32 %b.ext, -1
+ store i1 %icmp0, i1 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}zext_bool_icmp_ne_k:
+; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], 1{{$}}
+; GCN: buffer_store_byte [[RESULT]]
+; GCN-NEXT: s_endpgm
+define void @zext_bool_icmp_ne_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+ %icmp0 = icmp ne i32 %a, %b
+ %ext = zext i1 %icmp0 to i32
+ %icmp1 = icmp ne i32 %ext, 2
+ store i1 %icmp1, i1 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}zext_bool_icmp_eq_k:
+; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}}
+; GCN: buffer_store_byte [[RESULT]]
+; GCN-NEXT: s_endpgm
+define void @zext_bool_icmp_eq_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+ %icmp0 = icmp ne i32 %a, %b
+ %ext = zext i1 %icmp0 to i32
+ %icmp1 = icmp eq i32 %ext, 2
+ store i1 %icmp1, i1 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/setcc.ll b/test/CodeGen/AMDGPU/setcc.ll
new file mode 100644
index 000000000000..f33a82df5ffb
--- /dev/null
+++ b/test/CodeGen/AMDGPU/setcc.ll
@@ -0,0 +1,377 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600 --check-prefix=FUNC %s
+; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+
+; FUNC-LABEL: {{^}}setcc_v2i32:
+; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW]}}, KC0[3].X, KC0[3].Z
+; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW]}}, KC0[2].W, KC0[3].Y
+
+define void @setcc_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) {
+ %result = icmp eq <2 x i32> %a, %b
+ %sext = sext <2 x i1> %result to <2 x i32>
+ store <2 x i32> %sext, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}setcc_v4i32:
+; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @setcc_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+ %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
+ %a = load <4 x i32>, <4 x i32> addrspace(1) * %in
+ %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr
+ %result = icmp eq <4 x i32> %a, %b
+ %sext = sext <4 x i1> %result to <4 x i32>
+ store <4 x i32> %sext, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+;;;==========================================================================;;;
+;; Float comparisons
+;;;==========================================================================;;;
+
+; FUNC-LABEL: {{^}}f32_oeq:
+; R600: SETE_DX10
+; SI: v_cmp_eq_f32
+define void @f32_oeq(i32 addrspace(1)* %out, float %a, float %b) {
+entry:
+ %0 = fcmp oeq float %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}f32_ogt:
+; R600: SETGT_DX10
+; SI: v_cmp_gt_f32
+define void @f32_ogt(i32 addrspace(1)* %out, float %a, float %b) {
+entry:
+ %0 = fcmp ogt float %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}f32_oge:
+; R600: SETGE_DX10
+; SI: v_cmp_ge_f32
+define void @f32_oge(i32 addrspace(1)* %out, float %a, float %b) {
+entry:
+ %0 = fcmp oge float %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}f32_olt:
+; R600: SETGT_DX10
+; SI: v_cmp_lt_f32
+define void @f32_olt(i32 addrspace(1)* %out, float %a, float %b) {
+entry:
+ %0 = fcmp olt float %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}f32_ole:
+; R600: SETGE_DX10
+; SI: v_cmp_le_f32
+define void @f32_ole(i32 addrspace(1)* %out, float %a, float %b) {
+entry:
+ %0 = fcmp ole float %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}f32_one:
+; R600-DAG: SETE_DX10
+; R600-DAG: SETE_DX10
+; R600-DAG: AND_INT
+; R600-DAG: SETNE_DX10
+; R600-DAG: AND_INT
+; R600-DAG: SETNE_INT
+
+; SI: v_cmp_lg_f32_e32 vcc
+; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
+define void @f32_one(i32 addrspace(1)* %out, float %a, float %b) {
+entry:
+ %0 = fcmp one float %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}f32_ord:
+; R600-DAG: SETE_DX10
+; R600-DAG: SETE_DX10
+; R600-DAG: AND_INT
+; R600-DAG: SETNE_INT
+; SI: v_cmp_o_f32
+define void @f32_ord(i32 addrspace(1)* %out, float %a, float %b) {
+entry:
+ %0 = fcmp ord float %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}f32_ueq:
+; R600-DAG: SETNE_DX10
+; R600-DAG: SETNE_DX10
+; R600-DAG: OR_INT
+; R600-DAG: SETE_DX10
+; R600-DAG: OR_INT
+; R600-DAG: SETNE_INT
+
+; SI: v_cmp_nlg_f32_e32 vcc
+; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
+define void @f32_ueq(i32 addrspace(1)* %out, float %a, float %b) {
+entry:
+ %0 = fcmp ueq float %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}f32_ugt:
+; R600: SETGE
+; R600: SETE_DX10
+; SI: v_cmp_nle_f32_e32 vcc
+; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
+define void @f32_ugt(i32 addrspace(1)* %out, float %a, float %b) {
+entry:
+ %0 = fcmp ugt float %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}f32_uge:
+; R600: SETGT
+; R600: SETE_DX10
+
+; SI: v_cmp_nlt_f32_e32 vcc
+; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
+define void @f32_uge(i32 addrspace(1)* %out, float %a, float %b) {
+entry:
+ %0 = fcmp uge float %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}f32_ult:
+; R600: SETGE
+; R600: SETE_DX10
+
+; SI: v_cmp_nge_f32_e32 vcc
+; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
+define void @f32_ult(i32 addrspace(1)* %out, float %a, float %b) {
+entry:
+ %0 = fcmp ult float %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}f32_ule:
+; R600: SETGT
+; R600: SETE_DX10
+
+; SI: v_cmp_ngt_f32_e32 vcc
+; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
+define void @f32_ule(i32 addrspace(1)* %out, float %a, float %b) {
+entry:
+ %0 = fcmp ule float %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}f32_une:
+; R600: SETNE_DX10
+; SI: v_cmp_neq_f32
+define void @f32_une(i32 addrspace(1)* %out, float %a, float %b) {
+entry:
+ %0 = fcmp une float %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}f32_uno:
+; R600: SETNE_DX10
+; R600: SETNE_DX10
+; R600: OR_INT
+; R600: SETNE_INT
+; SI: v_cmp_u_f32
+define void @f32_uno(i32 addrspace(1)* %out, float %a, float %b) {
+entry:
+ %0 = fcmp uno float %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+;;;==========================================================================;;;
+;; 32-bit integer comparisons
+;;;==========================================================================;;;
+
+; FUNC-LABEL: {{^}}i32_eq:
+; R600: SETE_INT
+; SI: v_cmp_eq_i32
+define void @i32_eq(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+entry:
+ %0 = icmp eq i32 %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}i32_ne:
+; R600: SETNE_INT
+; SI: v_cmp_ne_i32
+define void @i32_ne(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+entry:
+ %0 = icmp ne i32 %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}i32_ugt:
+; R600: SETGT_UINT
+; SI: v_cmp_gt_u32
+define void @i32_ugt(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+entry:
+ %0 = icmp ugt i32 %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}i32_uge:
+; R600: SETGE_UINT
+; SI: v_cmp_ge_u32
+define void @i32_uge(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+entry:
+ %0 = icmp uge i32 %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}i32_ult:
+; R600: SETGT_UINT
+; SI: v_cmp_lt_u32
+define void @i32_ult(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+entry:
+ %0 = icmp ult i32 %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}i32_ule:
+; R600: SETGE_UINT
+; SI: v_cmp_le_u32
+define void @i32_ule(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+entry:
+ %0 = icmp ule i32 %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}i32_sgt:
+; R600: SETGT_INT
+; SI: v_cmp_gt_i32
+define void @i32_sgt(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+entry:
+ %0 = icmp sgt i32 %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}i32_sge:
+; R600: SETGE_INT
+; SI: v_cmp_ge_i32
+define void @i32_sge(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+entry:
+ %0 = icmp sge i32 %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}i32_slt:
+; R600: SETGT_INT
+; SI: v_cmp_lt_i32
+define void @i32_slt(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+entry:
+ %0 = icmp slt i32 %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}i32_sle:
+; R600: SETGE_INT
+; SI: v_cmp_le_i32
+define void @i32_sle(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+entry:
+ %0 = icmp sle i32 %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FIXME: This does 4 compares
+; FUNC-LABEL: {{^}}v3i32_eq:
+; SI-DAG: v_cmp_eq_i32
+; SI-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1,
+; SI-DAG: v_cmp_eq_i32
+; SI-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1,
+; SI-DAG: v_cmp_eq_i32
+; SI-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1,
+; SI: s_endpgm
+define void @v3i32_eq(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %ptra, <3 x i32> addrspace(1)* %ptrb) {
+ %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %gep.a = getelementptr <3 x i32>, <3 x i32> addrspace(1)* %ptra, i32 %tid
+ %gep.b = getelementptr <3 x i32>, <3 x i32> addrspace(1)* %ptrb, i32 %tid
+ %gep.out = getelementptr <3 x i32>, <3 x i32> addrspace(1)* %out, i32 %tid
+ %a = load <3 x i32>, <3 x i32> addrspace(1)* %gep.a
+ %b = load <3 x i32>, <3 x i32> addrspace(1)* %gep.b
+ %cmp = icmp eq <3 x i32> %a, %b
+ %ext = sext <3 x i1> %cmp to <3 x i32>
+ store <3 x i32> %ext, <3 x i32> addrspace(1)* %gep.out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v3i8_eq:
+; SI-DAG: v_cmp_eq_i32
+; SI-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1,
+; SI-DAG: v_cmp_eq_i32
+; SI-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1,
+; SI-DAG: v_cmp_eq_i32
+; SI-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1,
+; SI: s_endpgm
+define void @v3i8_eq(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %ptra, <3 x i8> addrspace(1)* %ptrb) {
+ %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %gep.a = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %ptra, i32 %tid
+ %gep.b = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %ptrb, i32 %tid
+ %gep.out = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %out, i32 %tid
+ %a = load <3 x i8>, <3 x i8> addrspace(1)* %gep.a
+ %b = load <3 x i8>, <3 x i8> addrspace(1)* %gep.b
+ %cmp = icmp eq <3 x i8> %a, %b
+ %ext = sext <3 x i1> %cmp to <3 x i8>
+ store <3 x i8> %ext, <3 x i8> addrspace(1)* %gep.out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/setcc64.ll b/test/CodeGen/AMDGPU/setcc64.ll
new file mode 100644
index 000000000000..231be7aa3da7
--- /dev/null
+++ b/test/CodeGen/AMDGPU/setcc64.ll
@@ -0,0 +1,259 @@
+;RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
+
+; XXX: Merge this into setcc, once R600 supports 64-bit operations
+
+;;;==========================================================================;;;
+;; Double comparisons
+;;;==========================================================================;;;
+
+; FUNC-LABEL: {{^}}f64_oeq:
+; SI: v_cmp_eq_f64
+define void @f64_oeq(i32 addrspace(1)* %out, double %a, double %b) {
+entry:
+ %0 = fcmp oeq double %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}f64_ogt:
+; SI: v_cmp_gt_f64
+define void @f64_ogt(i32 addrspace(1)* %out, double %a, double %b) {
+entry:
+ %0 = fcmp ogt double %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}f64_oge:
+; SI: v_cmp_ge_f64
+define void @f64_oge(i32 addrspace(1)* %out, double %a, double %b) {
+entry:
+ %0 = fcmp oge double %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}f64_olt:
+; SI: v_cmp_lt_f64
+define void @f64_olt(i32 addrspace(1)* %out, double %a, double %b) {
+entry:
+ %0 = fcmp olt double %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}f64_ole:
+; SI: v_cmp_le_f64
+define void @f64_ole(i32 addrspace(1)* %out, double %a, double %b) {
+entry:
+ %0 = fcmp ole double %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}f64_one:
+; SI: v_cmp_lg_f64_e32 vcc
+; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
+define void @f64_one(i32 addrspace(1)* %out, double %a, double %b) {
+entry:
+ %0 = fcmp one double %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}f64_ord:
+; SI: v_cmp_o_f64
+define void @f64_ord(i32 addrspace(1)* %out, double %a, double %b) {
+entry:
+ %0 = fcmp ord double %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}f64_ueq:
+; SI: v_cmp_nlg_f64_e32 vcc
+; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
+define void @f64_ueq(i32 addrspace(1)* %out, double %a, double %b) {
+entry:
+ %0 = fcmp ueq double %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}f64_ugt:
+
+; SI: v_cmp_nle_f64_e32 vcc
+; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
+define void @f64_ugt(i32 addrspace(1)* %out, double %a, double %b) {
+entry:
+ %0 = fcmp ugt double %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}f64_uge:
+; SI: v_cmp_nlt_f64_e32 vcc
+; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
+define void @f64_uge(i32 addrspace(1)* %out, double %a, double %b) {
+entry:
+ %0 = fcmp uge double %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}f64_ult:
+; SI: v_cmp_nge_f64_e32 vcc
+; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
+define void @f64_ult(i32 addrspace(1)* %out, double %a, double %b) {
+entry:
+ %0 = fcmp ult double %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}f64_ule:
+; SI: v_cmp_ngt_f64_e32 vcc
+; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
+define void @f64_ule(i32 addrspace(1)* %out, double %a, double %b) {
+entry:
+ %0 = fcmp ule double %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}f64_une:
+; SI: v_cmp_neq_f64
+define void @f64_une(i32 addrspace(1)* %out, double %a, double %b) {
+entry:
+ %0 = fcmp une double %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}f64_uno:
+; SI: v_cmp_u_f64
+define void @f64_uno(i32 addrspace(1)* %out, double %a, double %b) {
+entry:
+ %0 = fcmp uno double %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+;;;==========================================================================;;;
+;; 64-bit integer comparisons
+;;;==========================================================================;;;
+
+; FUNC-LABEL: {{^}}i64_eq:
+; SI: v_cmp_eq_i64
+define void @i64_eq(i32 addrspace(1)* %out, i64 %a, i64 %b) {
+entry:
+ %0 = icmp eq i64 %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}i64_ne:
+; SI: v_cmp_ne_i64
+define void @i64_ne(i32 addrspace(1)* %out, i64 %a, i64 %b) {
+entry:
+ %0 = icmp ne i64 %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}i64_ugt:
+; SI: v_cmp_gt_u64
+define void @i64_ugt(i32 addrspace(1)* %out, i64 %a, i64 %b) {
+entry:
+ %0 = icmp ugt i64 %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}i64_uge:
+; SI: v_cmp_ge_u64
+define void @i64_uge(i32 addrspace(1)* %out, i64 %a, i64 %b) {
+entry:
+ %0 = icmp uge i64 %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}i64_ult:
+; SI: v_cmp_lt_u64
+define void @i64_ult(i32 addrspace(1)* %out, i64 %a, i64 %b) {
+entry:
+ %0 = icmp ult i64 %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}i64_ule:
+; SI: v_cmp_le_u64
+define void @i64_ule(i32 addrspace(1)* %out, i64 %a, i64 %b) {
+entry:
+ %0 = icmp ule i64 %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}i64_sgt:
+; SI: v_cmp_gt_i64
+define void @i64_sgt(i32 addrspace(1)* %out, i64 %a, i64 %b) {
+entry:
+ %0 = icmp sgt i64 %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}i64_sge:
+; SI: v_cmp_ge_i64
+define void @i64_sge(i32 addrspace(1)* %out, i64 %a, i64 %b) {
+entry:
+ %0 = icmp sge i64 %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}i64_slt:
+; SI: v_cmp_lt_i64
+define void @i64_slt(i32 addrspace(1)* %out, i64 %a, i64 %b) {
+entry:
+ %0 = icmp slt i64 %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}i64_sle:
+; SI: v_cmp_le_i64
+define void @i64_sle(i32 addrspace(1)* %out, i64 %a, i64 %b) {
+entry:
+ %0 = icmp sle i64 %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/seto.ll b/test/CodeGen/AMDGPU/seto.ll
new file mode 100644
index 000000000000..9b5d6b5dbd62
--- /dev/null
+++ b/test/CodeGen/AMDGPU/seto.ll
@@ -0,0 +1,15 @@
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
+
+; CHECK-LABEL: {{^}}main:
+; CHECK: v_cmp_o_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], [[SREG:s[0-9]+]], [[SREG]]
+; CHECK-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, 1.0, [[CMP]]
+define void @main(float %p) {
+main_body:
+ %c = fcmp oeq float %p, %p
+ %r = select i1 %c, float 1.000000e+00, float 0.000000e+00
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %r, float %r, float %r, float %r)
+ ret void
+}
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
diff --git a/test/CodeGen/AMDGPU/setuo.ll b/test/CodeGen/AMDGPU/setuo.ll
new file mode 100644
index 000000000000..76346c4f624a
--- /dev/null
+++ b/test/CodeGen/AMDGPU/setuo.ll
@@ -0,0 +1,15 @@
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
+
+; CHECK-LABEL: {{^}}main:
+; CHECK: v_cmp_u_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], [[SREG:s[0-9]+]], [[SREG]]
+; CHECK-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, 1.0, [[CMP]]
+define void @main(float %p) {
+main_body:
+ %c = fcmp une float %p, %p
+ %r = select i1 %c, float 1.000000e+00, float 0.000000e+00
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %r, float %r, float %r, float %r)
+ ret void
+}
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
diff --git a/test/CodeGen/AMDGPU/sext-eliminate.ll b/test/CodeGen/AMDGPU/sext-eliminate.ll
new file mode 100644
index 000000000000..7dc6eb87f6b5
--- /dev/null
+++ b/test/CodeGen/AMDGPU/sext-eliminate.ll
@@ -0,0 +1,26 @@
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+
+; FUNC-LABEL: {{^}}sext_in_reg_i1_i32_add:
+
+; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]]
+; EG: SUB_INT {{[* ]*}}[[RES]]
+; EG-NOT: BFE
+define void @sext_in_reg_i1_i32_add(i32 addrspace(1)* %out, i1 %a, i32 %b) {
+ %sext = sext i1 %a to i32
+ %res = add i32 %b, %sext
+ store i32 %res, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sext_in_reg_i1_i32_sub:
+
+; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]]
+; EG: ADD_INT {{[* ]*}}[[RES]]
+; EG-NOT: BFE
+define void @sext_in_reg_i1_i32_sub(i32 addrspace(1)* %out, i1 %a, i32 %b) {
+ %sext = sext i1 %a to i32
+ %res = sub i32 %b, %sext
+ store i32 %res, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/sext-in-reg.ll b/test/CodeGen/AMDGPU/sext-in-reg.ll
new file mode 100644
index 000000000000..5aedda2ce1a9
--- /dev/null
+++ b/test/CodeGen/AMDGPU/sext-in-reg.ll
@@ -0,0 +1,611 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+declare i32 @llvm.AMDGPU.imax(i32, i32) nounwind readnone
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+
+
+; FUNC-LABEL: {{^}}sext_in_reg_i1_i32:
+; SI: s_load_dword [[ARG:s[0-9]+]],
+; SI: s_bfe_i32 [[SEXTRACT:s[0-9]+]], [[ARG]], 0x10000
+; SI: v_mov_b32_e32 [[EXTRACT:v[0-9]+]], [[SEXTRACT]]
+; SI: buffer_store_dword [[EXTRACT]],
+
+; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]]
+; EG: BFE_INT [[RES]], {{.*}}, 0.0, 1
+; EG-NEXT: LSHR * [[ADDR]]
+define void @sext_in_reg_i1_i32(i32 addrspace(1)* %out, i32 %in) {
+ %shl = shl i32 %in, 31
+ %sext = ashr i32 %shl, 31
+ store i32 %sext, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sext_in_reg_i8_to_i32:
+; SI: s_add_i32 [[VAL:s[0-9]+]],
+; SI: s_sext_i32_i8 [[EXTRACT:s[0-9]+]], [[VAL]]
+; SI: v_mov_b32_e32 [[VEXTRACT:v[0-9]+]], [[EXTRACT]]
+; SI: buffer_store_dword [[VEXTRACT]],
+
+; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]]
+; EG: ADD_INT
+; EG-NEXT: BFE_INT [[RES]], {{.*}}, 0.0, literal
+; EG-NEXT: LSHR * [[ADDR]]
+define void @sext_in_reg_i8_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+ %c = add i32 %a, %b ; add to prevent folding into extload
+ %shl = shl i32 %c, 24
+ %ashr = ashr i32 %shl, 24
+ store i32 %ashr, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sext_in_reg_i16_to_i32:
+; SI: s_add_i32 [[VAL:s[0-9]+]],
+; SI: s_sext_i32_i16 [[EXTRACT:s[0-9]+]], [[VAL]]
+; SI: v_mov_b32_e32 [[VEXTRACT:v[0-9]+]], [[EXTRACT]]
+; SI: buffer_store_dword [[VEXTRACT]],
+
+; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]]
+; EG: ADD_INT
+; EG-NEXT: BFE_INT [[RES]], {{.*}}, 0.0, literal
+; EG-NEXT: LSHR * [[ADDR]]
+define void @sext_in_reg_i16_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+ %c = add i32 %a, %b ; add to prevent folding into extload
+ %shl = shl i32 %c, 16
+ %ashr = ashr i32 %shl, 16
+ store i32 %ashr, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sext_in_reg_i8_to_v1i32:
+; SI: s_add_i32 [[VAL:s[0-9]+]],
+; SI: s_sext_i32_i8 [[EXTRACT:s[0-9]+]], [[VAL]]
+; SI: v_mov_b32_e32 [[VEXTRACT:v[0-9]+]], [[EXTRACT]]
+; SI: buffer_store_dword [[VEXTRACT]],
+
+; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]]
+; EG: ADD_INT
+; EG-NEXT: BFE_INT [[RES]], {{.*}}, 0.0, literal
+; EG-NEXT: LSHR * [[ADDR]]
+define void @sext_in_reg_i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) nounwind {
+ %c = add <1 x i32> %a, %b ; add to prevent folding into extload
+ %shl = shl <1 x i32> %c, <i32 24>
+ %ashr = ashr <1 x i32> %shl, <i32 24>
+ store <1 x i32> %ashr, <1 x i32> addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sext_in_reg_i1_to_i64:
+; SI: s_lshl_b64 [[VAL:s\[[0-9]+:[0-9]+\]]]
+; SI-DAG: s_bfe_i64 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]{{\]}}, [[VAL]], 0x10000
+; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]]
+; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
+; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
+define void @sext_in_reg_i1_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+ %c = shl i64 %a, %b
+ %shl = shl i64 %c, 63
+ %ashr = ashr i64 %shl, 63
+ store i64 %ashr, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sext_in_reg_i8_to_i64:
+; SI: s_lshl_b64 [[VAL:s\[[0-9]+:[0-9]+\]]]
+; SI-DAG: s_bfe_i64 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]{{\]}}, [[VAL]], 0x80000
+; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]]
+; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
+; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
+
+; EG: MEM_{{.*}} STORE_{{.*}} [[RES_LO:T[0-9]+\.[XYZW]]], [[ADDR_LO:T[0-9]+.[XYZW]]]
+; EG: MEM_{{.*}} STORE_{{.*}} [[RES_HI:T[0-9]+\.[XYZW]]], [[ADDR_HI:T[0-9]+.[XYZW]]]
+; EG: LSHL
+; EG: BFE_INT {{\*?}} [[RES_LO]], {{.*}}, 0.0, literal
+; EG: ASHR [[RES_HI]]
+; EG-NOT: BFE_INT
+; EG: LSHR
+; EG: LSHR
+;; TODO: Check the address computation; using | with variables in {{}} does
+;; not work, and the _LO/_HI order might also differ.
+define void @sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+ %c = shl i64 %a, %b
+ %shl = shl i64 %c, 56
+ %ashr = ashr i64 %shl, 56
+ store i64 %ashr, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sext_in_reg_i16_to_i64:
+; SI: s_lshl_b64 [[VAL:s\[[0-9]+:[0-9]+\]]]
+; SI-DAG: s_bfe_i64 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]{{\]}}, [[VAL]], 0x100000
+; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]]
+; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
+; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
+
+; EG: MEM_{{.*}} STORE_{{.*}} [[RES_LO:T[0-9]+\.[XYZW]]], [[ADDR_LO:T[0-9]+.[XYZW]]]
+; EG: MEM_{{.*}} STORE_{{.*}} [[RES_HI:T[0-9]+\.[XYZW]]], [[ADDR_HI:T[0-9]+.[XYZW]]]
+; EG: LSHL
+; EG: BFE_INT {{\*?}} [[RES_LO]], {{.*}}, 0.0, literal
+; EG: ASHR [[RES_HI]]
+; EG-NOT: BFE_INT
+; EG: LSHR
+; EG: LSHR
+;; TODO: Check the address computation; using | with variables in {{}} does
+;; not work, and the _LO/_HI order might also differ.
+define void @sext_in_reg_i16_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+ %c = shl i64 %a, %b
+ %shl = shl i64 %c, 48
+ %ashr = ashr i64 %shl, 48
+ store i64 %ashr, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sext_in_reg_i32_to_i64:
+; SI: s_lshl_b64 [[VAL:s\[[0-9]+:[0-9]+\]]]
+; SI-DAG: s_bfe_i64 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]{{\]}}, [[VAL]], 0x200000
+; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]]
+; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
+; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
+
+; EG: MEM_{{.*}} STORE_{{.*}} [[RES_LO:T[0-9]+\.[XYZW]]], [[ADDR_LO:T[0-9]+.[XYZW]]]
+; EG: MEM_{{.*}} STORE_{{.*}} [[RES_HI:T[0-9]+\.[XYZW]]], [[ADDR_HI:T[0-9]+.[XYZW]]]
+; EG-NOT: BFE_INT
+
+; EG: ASHR [[RES_HI]]
+
+; EG: LSHR
+; EG: LSHR
+;; TODO: Check the address computation; using | with variables in {{}} does
+;; not work, and the _LO/_HI order might also differ.
+define void @sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+ %c = shl i64 %a, %b
+ %shl = shl i64 %c, 32
+ %ashr = ashr i64 %shl, 32
+ store i64 %ashr, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; This is broken on Evergreen for some reason related to the <1 x i64> kernel arguments.
+; XFUNC-LABEL: {{^}}sext_in_reg_i8_to_v1i64:
+; XSI: s_bfe_i32 [[EXTRACT:s[0-9]+]], {{s[0-9]+}}, 524288
+; XSI: s_ashr_i32 {{v[0-9]+}}, [[EXTRACT]], 31
+; XSI: buffer_store_dword
+; XEG: BFE_INT
+; XEG: ASHR
+; define void @sext_in_reg_i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a, <1 x i64> %b) nounwind {
+; %c = add <1 x i64> %a, %b
+; %shl = shl <1 x i64> %c, <i64 56>
+; %ashr = ashr <1 x i64> %shl, <i64 56>
+; store <1 x i64> %ashr, <1 x i64> addrspace(1)* %out, align 8
+; ret void
+; }
+
+; FUNC-LABEL: {{^}}v_sext_in_reg_i1_to_i64:
+; SI: buffer_load_dwordx2
+; SI: v_lshl_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}
+; SI: v_bfe_i32 v[[LO:[0-9]+]], v[[VAL_LO]], 0, 1
+; SI: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
+; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @v_sext_in_reg_i1_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
+ %b.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
+ %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
+ %a = load i64, i64 addrspace(1)* %a.gep, align 8
+ %b = load i64, i64 addrspace(1)* %b.gep, align 8
+
+ %c = shl i64 %a, %b
+ %shl = shl i64 %c, 63
+ %ashr = ashr i64 %shl, 63
+ store i64 %ashr, i64 addrspace(1)* %out.gep, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_sext_in_reg_i8_to_i64:
+; SI: buffer_load_dwordx2
+; SI: v_lshl_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}
+; SI: v_bfe_i32 v[[LO:[0-9]+]], v[[VAL_LO]], 0, 8
+; SI: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
+; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @v_sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
+ %b.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
+ %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
+ %a = load i64, i64 addrspace(1)* %a.gep, align 8
+ %b = load i64, i64 addrspace(1)* %b.gep, align 8
+
+ %c = shl i64 %a, %b
+ %shl = shl i64 %c, 56
+ %ashr = ashr i64 %shl, 56
+ store i64 %ashr, i64 addrspace(1)* %out.gep, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_sext_in_reg_i16_to_i64:
+; SI: buffer_load_dwordx2
+; SI: v_lshl_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}
+; SI: v_bfe_i32 v[[LO:[0-9]+]], v[[VAL_LO]], 0, 16
+; SI: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
+; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @v_sext_in_reg_i16_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
+ %b.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
+ %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
+ %a = load i64, i64 addrspace(1)* %a.gep, align 8
+ %b = load i64, i64 addrspace(1)* %b.gep, align 8
+
+ %c = shl i64 %a, %b
+ %shl = shl i64 %c, 48
+ %ashr = ashr i64 %shl, 48
+ store i64 %ashr, i64 addrspace(1)* %out.gep, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_sext_in_reg_i32_to_i64:
+; SI: buffer_load_dwordx2
+; SI: v_lshl_b64 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}},
+; SI: v_ashrrev_i32_e32 v[[SHR:[0-9]+]], 31, v[[LO]]
+; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[SHR]]{{\]}}
+define void @v_sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
+ %b.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
+ %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
+ %a = load i64, i64 addrspace(1)* %a.gep, align 8
+ %b = load i64, i64 addrspace(1)* %b.gep, align 8
+
+ %c = shl i64 %a, %b
+ %shl = shl i64 %c, 32
+ %ashr = ashr i64 %shl, 32
+ store i64 %ashr, i64 addrspace(1)* %out.gep, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sext_in_reg_i1_in_i32_other_amount:
+; SI-NOT: s_lshl
+; SI-NOT: s_ashr
+; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x190001
+
+; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]]
+; EG-NOT: BFE
+; EG: ADD_INT
+; EG: LSHL
+; EG: ASHR [[RES]]
+; EG: LSHR {{\*?}} [[ADDR]]
+define void @sext_in_reg_i1_in_i32_other_amount(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+ %c = add i32 %a, %b
+ %x = shl i32 %c, 6
+ %y = ashr i32 %x, 7
+ store i32 %y, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sext_in_reg_v2i1_in_v2i32_other_amount:
+; SI-NOT: s_lshl
+; SI-NOT: s_ashr
+; SI-DAG: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x190001
+; SI-DAG: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x190001
+; SI: s_endpgm
+
+; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]]
+; EG-NOT: BFE
+; EG: ADD_INT
+; EG: LSHL
+; EG: ASHR [[RES]]
+; EG: LSHL
+; EG: ASHR [[RES]]
+; EG: LSHR {{\*?}} [[ADDR]]
+define void @sext_in_reg_v2i1_in_v2i32_other_amount(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind {
+ %c = add <2 x i32> %a, %b
+ %x = shl <2 x i32> %c, <i32 6, i32 6>
+ %y = ashr <2 x i32> %x, <i32 7, i32 7>
+ store <2 x i32> %y, <2 x i32> addrspace(1)* %out, align 2
+ ret void
+}
+
+
+; FUNC-LABEL: {{^}}sext_in_reg_v2i1_to_v2i32:
+; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
+; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
+; SI: buffer_store_dwordx2
+
+; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]]
+; EG: BFE_INT [[RES]]
+; EG: BFE_INT [[RES]]
+; EG: LSHR {{\*?}} [[ADDR]]
+define void @sext_in_reg_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind {
+ %c = add <2 x i32> %a, %b ; add to prevent folding into extload
+ %shl = shl <2 x i32> %c, <i32 31, i32 31>
+ %ashr = ashr <2 x i32> %shl, <i32 31, i32 31>
+ store <2 x i32> %ashr, <2 x i32> addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sext_in_reg_v4i1_to_v4i32:
+; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
+; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
+; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
+; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
+; SI: buffer_store_dwordx4
+
+; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW][XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]]
+; EG: BFE_INT [[RES]]
+; EG: BFE_INT [[RES]]
+; EG: BFE_INT [[RES]]
+; EG: BFE_INT [[RES]]
+; EG: LSHR {{\*?}} [[ADDR]]
+define void @sext_in_reg_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) nounwind {
+ %c = add <4 x i32> %a, %b ; add to prevent folding into extload
+ %shl = shl <4 x i32> %c, <i32 31, i32 31, i32 31, i32 31>
+ %ashr = ashr <4 x i32> %shl, <i32 31, i32 31, i32 31, i32 31>
+ store <4 x i32> %ashr, <4 x i32> addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sext_in_reg_v2i8_to_v2i32:
+; SI: s_sext_i32_i8 {{s[0-9]+}}, {{s[0-9]+}}
+; SI: s_sext_i32_i8 {{s[0-9]+}}, {{s[0-9]+}}
+; SI: buffer_store_dwordx2
+
+; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]]
+; EG: BFE_INT [[RES]]
+; EG: BFE_INT [[RES]]
+; EG: LSHR {{\*?}} [[ADDR]]
+define void @sext_in_reg_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind {
+ %c = add <2 x i32> %a, %b ; add to prevent folding into extload
+ %shl = shl <2 x i32> %c, <i32 24, i32 24>
+ %ashr = ashr <2 x i32> %shl, <i32 24, i32 24>
+ store <2 x i32> %ashr, <2 x i32> addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sext_in_reg_v4i8_to_v4i32:
+; SI: s_sext_i32_i8 {{s[0-9]+}}, {{s[0-9]+}}
+; SI: s_sext_i32_i8 {{s[0-9]+}}, {{s[0-9]+}}
+; SI: s_sext_i32_i8 {{s[0-9]+}}, {{s[0-9]+}}
+; SI: s_sext_i32_i8 {{s[0-9]+}}, {{s[0-9]+}}
+; SI: buffer_store_dwordx4
+
+; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW][XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]]
+; EG: BFE_INT [[RES]]
+; EG: BFE_INT [[RES]]
+; EG: BFE_INT [[RES]]
+; EG: BFE_INT [[RES]]
+; EG: LSHR {{\*?}} [[ADDR]]
+define void @sext_in_reg_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) nounwind {
+ %c = add <4 x i32> %a, %b ; add to prevent folding into extload
+ %shl = shl <4 x i32> %c, <i32 24, i32 24, i32 24, i32 24>
+ %ashr = ashr <4 x i32> %shl, <i32 24, i32 24, i32 24, i32 24>
+ store <4 x i32> %ashr, <4 x i32> addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sext_in_reg_v2i16_to_v2i32:
+; SI: s_sext_i32_i16 {{s[0-9]+}}, {{s[0-9]+}}
+; SI: s_sext_i32_i16 {{s[0-9]+}}, {{s[0-9]+}}
+; SI: buffer_store_dwordx2
+
+; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]]
+; EG: BFE_INT [[RES]]
+; EG: BFE_INT [[RES]]
+; EG: LSHR {{\*?}} [[ADDR]]
+define void @sext_in_reg_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind {
+ %c = add <2 x i32> %a, %b ; add to prevent folding into extload
+ %shl = shl <2 x i32> %c, <i32 16, i32 16>
+ %ashr = ashr <2 x i32> %shl, <i32 16, i32 16>
+ store <2 x i32> %ashr, <2 x i32> addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}testcase:
+define void @testcase(i8 addrspace(1)* %out, i8 %a) nounwind {
+ %and_a_1 = and i8 %a, 1
+ %cmp_eq = icmp eq i8 %and_a_1, 0
+ %cmp_slt = icmp slt i8 %a, 0
+ %sel0 = select i1 %cmp_slt, i8 0, i8 %a
+ %sel1 = select i1 %cmp_eq, i8 0, i8 %a
+ %xor = xor i8 %sel0, %sel1
+ store i8 %xor, i8 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}testcase_3:
+define void @testcase_3(i8 addrspace(1)* %out, i8 %a) nounwind {
+ %and_a_1 = and i8 %a, 1
+ %cmp_eq = icmp eq i8 %and_a_1, 0
+ %cmp_slt = icmp slt i8 %a, 0
+ %sel0 = select i1 %cmp_slt, i8 0, i8 %a
+ %sel1 = select i1 %cmp_eq, i8 0, i8 %a
+ %xor = xor i8 %sel0, %sel1
+ store i8 %xor, i8 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}vgpr_sext_in_reg_v4i8_to_v4i32:
+; SI: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
+; SI: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
+; SI: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
+; SI: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
+define void @vgpr_sext_in_reg_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b) nounwind {
+ %loada = load <4 x i32>, <4 x i32> addrspace(1)* %a, align 16
+ %loadb = load <4 x i32>, <4 x i32> addrspace(1)* %b, align 16
+ %c = add <4 x i32> %loada, %loadb ; add to prevent folding into extload
+ %shl = shl <4 x i32> %c, <i32 24, i32 24, i32 24, i32 24>
+ %ashr = ashr <4 x i32> %shl, <i32 24, i32 24, i32 24, i32 24>
+ store <4 x i32> %ashr, <4 x i32> addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}vgpr_sext_in_reg_v4i16_to_v4i32:
+; SI: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 16
+; SI: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 16
+define void @vgpr_sext_in_reg_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b) nounwind {
+ %loada = load <4 x i32>, <4 x i32> addrspace(1)* %a, align 16
+ %loadb = load <4 x i32>, <4 x i32> addrspace(1)* %b, align 16
+ %c = add <4 x i32> %loada, %loadb ; add to prevent folding into extload
+ %shl = shl <4 x i32> %c, <i32 16, i32 16, i32 16, i32 16>
+ %ashr = ashr <4 x i32> %shl, <i32 16, i32 16, i32 16, i32 16>
+ store <4 x i32> %ashr, <4 x i32> addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sext_in_reg_to_illegal_type:
+; SI: buffer_load_sbyte
+; SI: v_max_i32
+; SI-NOT: bfe
+; SI: buffer_store_short
+define void @sext_in_reg_to_illegal_type(i16 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %src) nounwind {
+ %tmp5 = load i8, i8 addrspace(1)* %src, align 1
+ %tmp2 = sext i8 %tmp5 to i32
+ %tmp3 = tail call i32 @llvm.AMDGPU.imax(i32 %tmp2, i32 0) nounwind readnone
+ %tmp4 = trunc i32 %tmp3 to i8
+ %tmp6 = sext i8 %tmp4 to i16
+ store i16 %tmp6, i16 addrspace(1)* %out, align 2
+ ret void
+}
+
+declare i32 @llvm.AMDGPU.bfe.i32(i32, i32, i32) nounwind readnone
+
+; FUNC-LABEL: {{^}}bfe_0_width:
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
+define void @bfe_0_width(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) nounwind {
+ %load = load i32, i32 addrspace(1)* %ptr, align 4
+ %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 8, i32 0) nounwind readnone
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfe_8_bfe_8:
+; SI: v_bfe_i32
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
+define void @bfe_8_bfe_8(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) nounwind {
+ %load = load i32, i32 addrspace(1)* %ptr, align 4
+ %bfe0 = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 0, i32 8) nounwind readnone
+ %bfe1 = call i32 @llvm.AMDGPU.bfe.i32(i32 %bfe0, i32 0, i32 8) nounwind readnone
+ store i32 %bfe1, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfe_8_bfe_16:
+; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8
+; SI: s_endpgm
+define void @bfe_8_bfe_16(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) nounwind {
+ %load = load i32, i32 addrspace(1)* %ptr, align 4
+ %bfe0 = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 0, i32 8) nounwind readnone
+ %bfe1 = call i32 @llvm.AMDGPU.bfe.i32(i32 %bfe0, i32 0, i32 16) nounwind readnone
+ store i32 %bfe1, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; This really should be folded into a single BFE.
+; FUNC-LABEL: {{^}}bfe_16_bfe_8:
+; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
+define void @bfe_16_bfe_8(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) nounwind {
+ %load = load i32, i32 addrspace(1)* %ptr, align 4
+ %bfe0 = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 0, i32 16) nounwind readnone
+ %bfe1 = call i32 @llvm.AMDGPU.bfe.i32(i32 %bfe0, i32 0, i32 8) nounwind readnone
+ store i32 %bfe1, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; Make sure there isn't a redundant BFE
+; FUNC-LABEL: {{^}}sext_in_reg_i8_to_i32_bfe:
+; SI: s_sext_i32_i8 s{{[0-9]+}}, s{{[0-9]+}}
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
+define void @sext_in_reg_i8_to_i32_bfe(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+ %c = add i32 %a, %b ; add to prevent folding into extload
+ %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %c, i32 0, i32 8) nounwind readnone
+ %shl = shl i32 %bfe, 24
+ %ashr = ashr i32 %shl, 24
+ store i32 %ashr, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sext_in_reg_i8_to_i32_bfe_wrong:
+define void @sext_in_reg_i8_to_i32_bfe_wrong(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+ %c = add i32 %a, %b ; add to prevent folding into extload
+ %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %c, i32 8, i32 0) nounwind readnone
+ %shl = shl i32 %bfe, 24
+ %ashr = ashr i32 %shl, 24
+ store i32 %ashr, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_i8_to_i32_bfe:
+; SI: buffer_load_sbyte
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
+define void @sextload_i8_to_i32_bfe(i32 addrspace(1)* %out, i8 addrspace(1)* %ptr) nounwind {
+ %load = load i8, i8 addrspace(1)* %ptr, align 1
+ %sext = sext i8 %load to i32
+ %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %sext, i32 0, i32 8) nounwind readnone
+ %shl = shl i32 %bfe, 24
+ %ashr = ashr i32 %shl, 24
+ store i32 %ashr, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI: .text
+; FUNC-LABEL: {{^}}sextload_i8_to_i32_bfe_0:{{.*$}}
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
+define void @sextload_i8_to_i32_bfe_0(i32 addrspace(1)* %out, i8 addrspace(1)* %ptr) nounwind {
+ %load = load i8, i8 addrspace(1)* %ptr, align 1
+ %sext = sext i8 %load to i32
+ %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %sext, i32 8, i32 0) nounwind readnone
+ %shl = shl i32 %bfe, 24
+ %ashr = ashr i32 %shl, 24
+ store i32 %ashr, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sext_in_reg_i1_bfe_offset_0:
+; SI-NOT: shr
+; SI-NOT: shl
+; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 1
+; SI: s_endpgm
+define void @sext_in_reg_i1_bfe_offset_0(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %x = load i32, i32 addrspace(1)* %in, align 4
+ %shl = shl i32 %x, 31
+ %shr = ashr i32 %shl, 31
+ %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shr, i32 0, i32 1)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sext_in_reg_i1_bfe_offset_1:
+; SI: buffer_load_dword
+; SI-NOT: shl
+; SI-NOT: shr
+; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 1, 1
+; SI: s_endpgm
+define void @sext_in_reg_i1_bfe_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %x = load i32, i32 addrspace(1)* %in, align 4
+ %shl = shl i32 %x, 30
+ %shr = ashr i32 %shl, 30
+ %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shr, i32 1, i32 1)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sext_in_reg_i2_bfe_offset_1:
+; SI: buffer_load_dword
+; SI-NOT: v_lshl
+; SI-NOT: v_ashr
+; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 2
+; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 1, 2
+; SI: s_endpgm
+define void @sext_in_reg_i2_bfe_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %x = load i32, i32 addrspace(1)* %in, align 4
+ %shl = shl i32 %x, 30
+ %shr = ashr i32 %shl, 30
+ %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shr, i32 1, i32 2)
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/sgpr-control-flow.ll b/test/CodeGen/AMDGPU/sgpr-control-flow.ll
new file mode 100644
index 000000000000..38289ced632a
--- /dev/null
+++ b/test/CodeGen/AMDGPU/sgpr-control-flow.ll
@@ -0,0 +1,105 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+;
+;
+; Most SALU instructions ignore control flow (their results are not masked by
+; the EXEC mask), so we need to make sure they don't overwrite values from
+; other blocks.
+
+; If the branch decision is made based on a value in an SGPR then all
+; threads will execute the same code paths, so we don't need to worry
+; about instructions in different blocks overwriting each other.
+; SI-LABEL: {{^}}sgpr_if_else_salu_br:
+; SI: s_add
+; SI: s_add
+
+define void @sgpr_if_else_salu_br(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) {
+entry:
+ %0 = icmp eq i32 %a, 0
+ br i1 %0, label %if, label %else
+
+if:
+ %1 = add i32 %b, %c
+ br label %endif
+
+else:
+ %2 = add i32 %d, %e
+ br label %endif
+
+endif:
+ %3 = phi i32 [%1, %if], [%2, %else]
+ %4 = add i32 %3, %a
+ store i32 %4, i32 addrspace(1)* %out
+ ret void
+}
+
+; The two S_ADD instructions should write to different registers, since
+; different threads will take different control flow paths.
+
+; SI-LABEL: {{^}}sgpr_if_else_valu_br:
+; SI: s_add_i32 [[SGPR:s[0-9]+]]
+; SI-NOT: s_add_i32 [[SGPR]]
+
+define void @sgpr_if_else_valu_br(i32 addrspace(1)* %out, float %a, i32 %b, i32 %c, i32 %d, i32 %e) {
+entry:
+ %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tid_f = uitofp i32 %tid to float
+ %tmp1 = fcmp ueq float %tid_f, 0.0
+ br i1 %tmp1, label %if, label %else
+
+if:
+ %tmp2 = add i32 %b, %c
+ br label %endif
+
+else:
+ %tmp3 = add i32 %d, %e
+ br label %endif
+
+endif:
+ %tmp4 = phi i32 [%tmp2, %if], [%tmp3, %else]
+ store i32 %tmp4, i32 addrspace(1)* %out
+ ret void
+}
+
+; FIXME: Should write to different SGPR pairs instead of copying to
+; VALU for i1 phi.
+
+; SI-LABEL: {{^}}sgpr_if_else_valu_cmp_phi_br:
+; SI: buffer_load_dword [[AVAL:v[0-9]+]]
+; SI: v_cmp_gt_i32_e32 [[CMP_IF:vcc]], 0, [[AVAL]]
+; SI: v_cndmask_b32_e64 [[V_CMP:v[0-9]+]], 0, -1, [[CMP_IF]]
+
+; SI: BB2_1:
+; SI: buffer_load_dword [[AVAL:v[0-9]+]]
+; SI: v_cmp_eq_i32_e32 [[CMP_ELSE:vcc]], 0, [[AVAL]]
+; SI: v_cndmask_b32_e64 [[V_CMP]], 0, -1, [[CMP_ELSE]]
+
+; SI: v_cmp_ne_i32_e32 [[CMP_CMP:vcc]], 0, [[V_CMP]]
+; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP_CMP]]
+; SI: buffer_store_dword [[RESULT]]
+define void @sgpr_if_else_valu_cmp_phi_br(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) {
+entry:
+ %tid = call i32 @llvm.r600.read.tidig.x() #0
+ %tmp1 = icmp eq i32 %tid, 0
+ br i1 %tmp1, label %if, label %else
+
+if:
+ %gep.if = getelementptr i32, i32 addrspace(1)* %a, i32 %tid
+ %a.val = load i32, i32 addrspace(1)* %gep.if
+ %cmp.if = icmp eq i32 %a.val, 0
+ br label %endif
+
+else:
+ %gep.else = getelementptr i32, i32 addrspace(1)* %b, i32 %tid
+ %b.val = load i32, i32 addrspace(1)* %gep.else
+ %cmp.else = icmp slt i32 %b.val, 0
+ br label %endif
+
+endif:
+ %tmp4 = phi i1 [%cmp.if, %if], [%cmp.else, %else]
+ %ext = sext i1 %tmp4 to i32
+ store i32 %ext, i32 addrspace(1)* %out
+ ret void
+}
+
+declare i32 @llvm.r600.read.tidig.x() #0
+
+attributes #0 = { readnone }
diff --git a/test/CodeGen/AMDGPU/sgpr-copy-duplicate-operand.ll b/test/CodeGen/AMDGPU/sgpr-copy-duplicate-operand.ll
new file mode 100644
index 000000000000..df67fcca22fe
--- /dev/null
+++ b/test/CodeGen/AMDGPU/sgpr-copy-duplicate-operand.ll
@@ -0,0 +1,19 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+; Copy VGPR -> SGPR used twice as an instruction operand, which is then
+; used in a REG_SEQUENCE that also needs to be handled.
+
+; SI-LABEL: {{^}}test_dup_operands:
+; SI: v_add_i32_e32
+define void @test_dup_operands(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %in) {
+ %a = load <2 x i32>, <2 x i32> addrspace(1)* %in
+ %lo = extractelement <2 x i32> %a, i32 0
+ %hi = extractelement <2 x i32> %a, i32 1
+ %add = add i32 %lo, %lo
+ %vec0 = insertelement <2 x i32> undef, i32 %add, i32 0
+ %vec1 = insertelement <2 x i32> %vec0, i32 %hi, i32 1
+ store <2 x i32> %vec1, <2 x i32> addrspace(1)* %out, align 8
+ ret void
+}
+
diff --git a/test/CodeGen/AMDGPU/sgpr-copy.ll b/test/CodeGen/AMDGPU/sgpr-copy.ll
new file mode 100644
index 000000000000..b849c4038bc7
--- /dev/null
+++ b/test/CodeGen/AMDGPU/sgpr-copy.ll
@@ -0,0 +1,379 @@
+; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
+
+; This test checks that no VGPR to SGPR copies are created by the register
+; allocator.
+; CHECK-LABEL: {{^}}phi1:
+; CHECK: s_buffer_load_dword [[DST:s[0-9]]], {{s\[[0-9]+:[0-9]+\]}}, 0x0
+; CHECK: v_mov_b32_e32 v{{[0-9]}}, [[DST]]
+
+define void @phi1(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
+main_body:
+ %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0
+ %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20, !tbaa !1
+ %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 0)
+ %23 = call float @llvm.SI.load.const(<16 x i8> %21, i32 16)
+ %24 = call float @llvm.SI.load.const(<16 x i8> %21, i32 32)
+ %25 = fptosi float %23 to i32
+ %26 = icmp ne i32 %25, 0
+ br i1 %26, label %ENDIF, label %ELSE
+
+ELSE: ; preds = %main_body
+ %27 = fsub float -0.000000e+00, %22
+ br label %ENDIF
+
+ENDIF: ; preds = %main_body, %ELSE
+ %temp.0 = phi float [ %27, %ELSE ], [ %22, %main_body ]
+ %28 = fadd float %temp.0, %24
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %28, float %28, float 0.000000e+00, float 1.000000e+00)
+ ret void
+}
+
+; Make sure this program doesn't crash
+; CHECK-LABEL: {{^}}phi2:
+define void @phi2(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
+main_body:
+ %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0
+ %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20, !tbaa !1
+ %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 16)
+ %23 = call float @llvm.SI.load.const(<16 x i8> %21, i32 32)
+ %24 = call float @llvm.SI.load.const(<16 x i8> %21, i32 36)
+ %25 = call float @llvm.SI.load.const(<16 x i8> %21, i32 40)
+ %26 = call float @llvm.SI.load.const(<16 x i8> %21, i32 48)
+ %27 = call float @llvm.SI.load.const(<16 x i8> %21, i32 52)
+ %28 = call float @llvm.SI.load.const(<16 x i8> %21, i32 56)
+ %29 = call float @llvm.SI.load.const(<16 x i8> %21, i32 64)
+ %30 = call float @llvm.SI.load.const(<16 x i8> %21, i32 68)
+ %31 = call float @llvm.SI.load.const(<16 x i8> %21, i32 72)
+ %32 = call float @llvm.SI.load.const(<16 x i8> %21, i32 76)
+ %33 = call float @llvm.SI.load.const(<16 x i8> %21, i32 80)
+ %34 = call float @llvm.SI.load.const(<16 x i8> %21, i32 84)
+ %35 = call float @llvm.SI.load.const(<16 x i8> %21, i32 88)
+ %36 = call float @llvm.SI.load.const(<16 x i8> %21, i32 92)
+ %37 = getelementptr <32 x i8>, <32 x i8> addrspace(2)* %2, i32 0
+ %38 = load <32 x i8>, <32 x i8> addrspace(2)* %37, !tbaa !1
+ %39 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %1, i32 0
+ %40 = load <16 x i8>, <16 x i8> addrspace(2)* %39, !tbaa !1
+ %41 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %3, <2 x i32> %5)
+ %42 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %3, <2 x i32> %5)
+ %43 = call float @llvm.SI.fs.interp(i32 0, i32 1, i32 %3, <2 x i32> %5)
+ %44 = call float @llvm.SI.fs.interp(i32 1, i32 1, i32 %3, <2 x i32> %5)
+ %45 = call float @llvm.SI.fs.interp(i32 2, i32 1, i32 %3, <2 x i32> %5)
+ %46 = bitcast float %41 to i32
+ %47 = bitcast float %42 to i32
+ %48 = insertelement <2 x i32> undef, i32 %46, i32 0
+ %49 = insertelement <2 x i32> %48, i32 %47, i32 1
+ %50 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %49, <32 x i8> %38, <16 x i8> %40, i32 2)
+ %51 = extractelement <4 x float> %50, i32 2
+ %52 = call float @fabs(float %51)
+ %53 = fmul float %43, %43
+ %54 = fmul float %44, %44
+ %55 = fadd float %54, %53
+ %56 = fmul float %45, %45
+ %57 = fadd float %55, %56
+ %58 = call float @llvm.AMDGPU.rsq.f32(float %57)
+ %59 = fmul float %43, %58
+ %60 = fmul float %44, %58
+ %61 = fmul float %45, %58
+ %62 = fmul float %59, %23
+ %63 = fmul float %60, %24
+ %64 = fadd float %63, %62
+ %65 = fmul float %61, %25
+ %66 = fadd float %64, %65
+ %67 = fsub float -0.000000e+00, %26
+ %68 = fmul float %66, %52
+ %69 = fadd float %68, %67
+ %70 = fmul float %27, %69
+ %71 = fmul float %28, %69
+ %72 = call float @fabs(float %70)
+ %73 = fcmp olt float 0x3EE4F8B580000000, %72
+ %74 = sext i1 %73 to i32
+ %75 = bitcast i32 %74 to float
+ %76 = bitcast float %75 to i32
+ %77 = icmp ne i32 %76, 0
+ br i1 %77, label %IF, label %ENDIF
+
+IF: ; preds = %main_body
+ %78 = fsub float -0.000000e+00, %70
+ %79 = call float @llvm.AMDIL.exp.(float %78)
+ %80 = fsub float -0.000000e+00, %79
+ %81 = fadd float 1.000000e+00, %80
+ %82 = fdiv float 1.000000e+00, %70
+ %83 = fmul float %81, %82
+ %84 = fmul float %32, %83
+ br label %ENDIF
+
+ENDIF: ; preds = %main_body, %IF
+ %temp4.0 = phi float [ %84, %IF ], [ %32, %main_body ]
+ %85 = call float @fabs(float %71)
+ %86 = fcmp olt float 0x3EE4F8B580000000, %85
+ %87 = sext i1 %86 to i32
+ %88 = bitcast i32 %87 to float
+ %89 = bitcast float %88 to i32
+ %90 = icmp ne i32 %89, 0
+ br i1 %90, label %IF25, label %ENDIF24
+
+IF25: ; preds = %ENDIF
+ %91 = fsub float -0.000000e+00, %71
+ %92 = call float @llvm.AMDIL.exp.(float %91)
+ %93 = fsub float -0.000000e+00, %92
+ %94 = fadd float 1.000000e+00, %93
+ %95 = fdiv float 1.000000e+00, %71
+ %96 = fmul float %94, %95
+ %97 = fmul float %36, %96
+ br label %ENDIF24
+
+ENDIF24: ; preds = %ENDIF, %IF25
+ %temp8.0 = phi float [ %97, %IF25 ], [ %36, %ENDIF ]
+ %98 = fmul float %29, %temp4.0
+ %99 = fmul float %30, %temp4.0
+ %100 = fmul float %31, %temp4.0
+ %101 = fmul float %33, %temp8.0
+ %102 = fadd float %101, %98
+ %103 = fmul float %34, %temp8.0
+ %104 = fadd float %103, %99
+ %105 = fmul float %35, %temp8.0
+ %106 = fadd float %105, %100
+ %107 = call float @llvm.pow.f32(float %52, float %22)
+ %108 = fsub float -0.000000e+00, %102
+ %109 = fmul float %108, %107
+ %110 = fsub float -0.000000e+00, %104
+ %111 = fmul float %110, %107
+ %112 = fsub float -0.000000e+00, %106
+ %113 = fmul float %112, %107
+ %114 = call i32 @llvm.SI.packf16(float %109, float %111)
+ %115 = bitcast i32 %114 to float
+ %116 = call i32 @llvm.SI.packf16(float %113, float 1.000000e+00)
+ %117 = bitcast i32 %116 to float
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %115, float %117, float %115, float %117)
+ ret void
+}
+
+; We just want to make sure the program doesn't crash
+; CHECK-LABEL: {{^}}loop:
+
+define void @loop(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
+main_body:
+ %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0
+ %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20, !tbaa !1
+ %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 0)
+ %23 = call float @llvm.SI.load.const(<16 x i8> %21, i32 4)
+ %24 = call float @llvm.SI.load.const(<16 x i8> %21, i32 8)
+ %25 = call float @llvm.SI.load.const(<16 x i8> %21, i32 12)
+ %26 = fptosi float %25 to i32
+ %27 = bitcast i32 %26 to float
+ %28 = bitcast float %27 to i32
+ br label %LOOP
+
+LOOP: ; preds = %ENDIF, %main_body
+ %temp4.0 = phi float [ %22, %main_body ], [ %temp5.0, %ENDIF ]
+ %temp5.0 = phi float [ %23, %main_body ], [ %temp6.0, %ENDIF ]
+ %temp6.0 = phi float [ %24, %main_body ], [ %temp4.0, %ENDIF ]
+ %temp8.0 = phi float [ 0.000000e+00, %main_body ], [ %37, %ENDIF ]
+ %29 = bitcast float %temp8.0 to i32
+ %30 = icmp sge i32 %29, %28
+ %31 = sext i1 %30 to i32
+ %32 = bitcast i32 %31 to float
+ %33 = bitcast float %32 to i32
+ %34 = icmp ne i32 %33, 0
+ br i1 %34, label %IF, label %ENDIF
+
+IF: ; preds = %LOOP
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %temp4.0, float %temp5.0, float %temp6.0, float 1.000000e+00)
+ ret void
+
+ENDIF: ; preds = %LOOP
+ %35 = bitcast float %temp8.0 to i32
+ %36 = add i32 %35, 1
+ %37 = bitcast i32 %36 to float
+ br label %LOOP
+}
+
+; Function Attrs: nounwind readnone
+declare float @llvm.SI.load.const(<16 x i8>, i32) #1
+
+; Function Attrs: readonly
+declare float @fabs(float) #2
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+
+attributes #0 = { "ShaderType"="0" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { readonly }
+attributes #3 = { readnone }
+attributes #4 = { nounwind readonly }
+
+!0 = !{!"const", null}
+!1 = !{!0, !0, i64 0, i32 1}
+
+; Function Attrs: nounwind readnone
+declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1
+
+; Function Attrs: nounwind readnone
+declare <4 x float> @llvm.SI.sample.v2i32(<2 x i32>, <32 x i8>, <16 x i8>, i32) #1
+
+; Function Attrs: readnone
+declare float @llvm.AMDGPU.rsq.f32(float) #3
+
+; Function Attrs: readnone
+declare float @llvm.AMDIL.exp.(float) #3
+
+; Function Attrs: nounwind readonly
+declare float @llvm.pow.f32(float, float) #4
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.SI.packf16(float, float) #1
+
+; This checks for a bug in the FixSGPRCopies pass where VReg96
+; registers were being identified as an SGPR regclass which was causing
+; an assertion failure.
+
+; CHECK-LABEL: {{^}}sample_v3:
+; CHECK: image_sample
+; CHECK: image_sample
+; CHECK: exp
+; CHECK: s_endpgm
+define void @sample_v3([17 x <16 x i8>] addrspace(2)* byval, [32 x <16 x i8>] addrspace(2)* byval, [16 x <32 x i8>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
+
+entry:
+ %21 = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %0, i64 0, i32 0
+ %22 = load <16 x i8>, <16 x i8> addrspace(2)* %21, !tbaa !2
+ %23 = call float @llvm.SI.load.const(<16 x i8> %22, i32 16)
+ %24 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 0
+ %25 = load <32 x i8>, <32 x i8> addrspace(2)* %24, !tbaa !2
+ %26 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 0
+ %27 = load <16 x i8>, <16 x i8> addrspace(2)* %26, !tbaa !2
+ %28 = fcmp oeq float %23, 0.0
+ br i1 %28, label %if, label %else
+
+if:
+ %val.if = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> <i32 0, i32 0>, <32 x i8> %25, <16 x i8> %27, i32 2)
+ %val.if.0 = extractelement <4 x float> %val.if, i32 0
+ %val.if.1 = extractelement <4 x float> %val.if, i32 1
+ %val.if.2 = extractelement <4 x float> %val.if, i32 2
+ br label %endif
+
+else:
+ %val.else = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> <i32 1, i32 0>, <32 x i8> %25, <16 x i8> %27, i32 2)
+ %val.else.0 = extractelement <4 x float> %val.else, i32 0
+ %val.else.1 = extractelement <4 x float> %val.else, i32 1
+ %val.else.2 = extractelement <4 x float> %val.else, i32 2
+ br label %endif
+
+endif:
+ %val.0 = phi float [%val.if.0, %if], [%val.else.0, %else]
+ %val.1 = phi float [%val.if.1, %if], [%val.else.1, %else]
+ %val.2 = phi float [%val.if.2, %if], [%val.else.2, %else]
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %val.0, float %val.1, float %val.2, float 0.0)
+ ret void
+}
+
+!2 = !{!"const", null, i32 1}
+
+; CHECK-LABEL: {{^}}copy1:
+; CHECK: buffer_load_dword
+; CHECK: v_add
+; CHECK: s_endpgm
+define void @copy1(float addrspace(1)* %out, float addrspace(1)* %in0) {
+entry:
+ %0 = load float, float addrspace(1)* %in0
+ %1 = fcmp oeq float %0, 0.0
+ br i1 %1, label %if0, label %endif
+
+if0:
+ %2 = bitcast float %0 to i32
+ %3 = fcmp olt float %0, 0.0
+ br i1 %3, label %if1, label %endif
+
+if1:
+ %4 = add i32 %2, 1
+ br label %endif
+
+endif:
+ %5 = phi i32 [ 0, %entry ], [ %2, %if0 ], [ %4, %if1 ]
+ %6 = bitcast i32 %5 to float
+ store float %6, float addrspace(1)* %out
+ ret void
+}
+
+; This test is just checking that we don't crash or hit an assertion failure.
+; CHECK-LABEL: {{^}}copy2:
+; CHECK: s_endpgm
+
+define void @copy2([17 x <16 x i8>] addrspace(2)* byval, [32 x <16 x i8>] addrspace(2)* byval, [16 x <32 x i8>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
+entry:
+ br label %LOOP68
+
+LOOP68:
+ %temp4.7 = phi float [ 0.000000e+00, %entry ], [ %v, %ENDIF69 ]
+ %t = phi i32 [ 20, %entry ], [ %x, %ENDIF69 ]
+ %g = icmp eq i32 0, %t
+ %l = bitcast float %temp4.7 to i32
+ br i1 %g, label %IF70, label %ENDIF69
+
+IF70:
+ %q = icmp ne i32 %l, 13
+ %temp.8 = select i1 %q, float 1.000000e+00, float 0.000000e+00
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %temp.8, float 0.000000e+00, float 0.000000e+00, float 1.000000e+00)
+ ret void
+
+ENDIF69:
+ %u = add i32 %l, %t
+ %v = bitcast i32 %u to float
+ %x = add i32 %t, -1
+ br label %LOOP68
+}
+
+attributes #0 = { "ShaderType"="0" }
+
+; This test checks that image_sample resource descriptors aren't loaded into
+; vgprs. The verifier will fail if this happens.
+; CHECK-LABEL: {{^}}sample_rsrc:
+; CHECK: image_sample
+; CHECK: image_sample
+; CHECK: s_endpgm
+define void @sample_rsrc([6 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <4 x i32>] addrspace(2)* byval %arg2, [32 x <8 x i32>] addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, float %arg20, float %arg21) #0 {
+bb:
+ %tmp = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %arg1, i32 0, i32 0
+ %tmp22 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0
+ %tmp23 = call float @llvm.SI.load.const(<16 x i8> %tmp22, i32 16)
+ %tmp25 = getelementptr [32 x <8 x i32>], [32 x <8 x i32>] addrspace(2)* %arg3, i32 0, i32 0
+ %tmp26 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp25, !tbaa !0
+ %tmp27 = getelementptr [16 x <4 x i32>], [16 x <4 x i32>] addrspace(2)* %arg2, i32 0, i32 0
+ %tmp28 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp27, !tbaa !0
+ %tmp29 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %arg5, <2 x i32> %arg7)
+ %tmp30 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %arg5, <2 x i32> %arg7)
+ %tmp31 = bitcast float %tmp23 to i32
+ %tmp36 = icmp ne i32 %tmp31, 0
+ br i1 %tmp36, label %bb38, label %bb80
+
+bb38: ; preds = %bb
+ %tmp52 = bitcast float %tmp29 to i32
+ %tmp53 = bitcast float %tmp30 to i32
+ %tmp54 = insertelement <2 x i32> undef, i32 %tmp52, i32 0
+ %tmp55 = insertelement <2 x i32> %tmp54, i32 %tmp53, i32 1
+ %tmp56 = bitcast <8 x i32> %tmp26 to <32 x i8>
+ %tmp57 = bitcast <4 x i32> %tmp28 to <16 x i8>
+ %tmp58 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %tmp55, <32 x i8> %tmp56, <16 x i8> %tmp57, i32 2)
+ br label %bb71
+
+bb80: ; preds = %bb
+ %tmp81 = bitcast float %tmp29 to i32
+ %tmp82 = bitcast float %tmp30 to i32
+ %tmp82.2 = add i32 %tmp82, 1
+ %tmp83 = insertelement <2 x i32> undef, i32 %tmp81, i32 0
+ %tmp84 = insertelement <2 x i32> %tmp83, i32 %tmp82.2, i32 1
+ %tmp85 = bitcast <8 x i32> %tmp26 to <32 x i8>
+ %tmp86 = bitcast <4 x i32> %tmp28 to <16 x i8>
+ %tmp87 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %tmp84, <32 x i8> %tmp85, <16 x i8> %tmp86, i32 2)
+ br label %bb71
+
+bb71: ; preds = %bb80, %bb38
+ %tmp72 = phi <4 x float> [ %tmp58, %bb38 ], [ %tmp87, %bb80 ]
+ %tmp88 = extractelement <4 x float> %tmp72, i32 0
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp88, float %tmp88, float %tmp88, float %tmp88)
+ ret void
+}
+
+attributes #0 = { "ShaderType"="0" "unsafe-fp-math"="true" }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/shared-op-cycle.ll b/test/CodeGen/AMDGPU/shared-op-cycle.ll
new file mode 100644
index 000000000000..f52a9baf4d18
--- /dev/null
+++ b/test/CodeGen/AMDGPU/shared-op-cycle.ll
@@ -0,0 +1,32 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+; CHECK: {{^}}main:
+; CHECK: MULADD_IEEE *
+; CHECK-NOT: MULADD_IEEE *
+
+define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2) #0 {
+ %w0 = extractelement <4 x float> %reg0, i32 3
+ %w1 = extractelement <4 x float> %reg1, i32 3
+ %w2 = extractelement <4 x float> %reg2, i32 3
+ %sq0 = fmul float %w0, %w0
+ %r0 = fadd float %sq0, 2.0
+ %sq1 = fmul float %w1, %w1
+ %r1 = fadd float %sq1, 2.0
+ %sq2 = fmul float %w2, %w2
+ %r2 = fadd float %sq2, 2.0
+ %v0 = insertelement <4 x float> undef, float %r0, i32 0
+ %v1 = insertelement <4 x float> %v0, float %r1, i32 1
+ %v2 = insertelement <4 x float> %v1, float %r2, i32 2
+ %res = call float @llvm.AMDGPU.dp4(<4 x float> %v2, <4 x float> %v2)
+ %vecres = insertelement <4 x float> undef, float %res, i32 0
+ call void @llvm.R600.store.swizzle(<4 x float> %vecres, i32 0, i32 2)
+ ret void
+}
+
+; Function Attrs: readnone
+declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1
+
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+
+attributes #0 = { "ShaderType"="1" }
+attributes #1 = { readnone }
\ No newline at end of file
diff --git a/test/CodeGen/AMDGPU/shl.ll b/test/CodeGen/AMDGPU/shl.ll
new file mode 100644
index 000000000000..53b63dc4b8ad
--- /dev/null
+++ b/test/CodeGen/AMDGPU/shl.ll
@@ -0,0 +1,180 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=VI %s
+
+;EG: {{^}}shl_v2i32:
+;EG: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+;SI: {{^}}shl_v2i32:
+;SI: v_lshl_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI: v_lshl_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+
+;VI: {{^}}shl_v2i32:
+;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+
+define void @shl_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+ %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
+ %a = load <2 x i32>, <2 x i32> addrspace(1) * %in
+ %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr
+ %result = shl <2 x i32> %a, %b
+ store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+;EG: {{^}}shl_v4i32:
+;EG: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+;SI: {{^}}shl_v4i32:
+;SI: v_lshl_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI: v_lshl_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI: v_lshl_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI: v_lshl_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+
+;VI: {{^}}shl_v4i32:
+;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+
+define void @shl_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+ %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
+ %a = load <4 x i32>, <4 x i32> addrspace(1) * %in
+ %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr
+ %result = shl <4 x i32> %a, %b
+ store <4 x i32> %result, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+;EG: {{^}}shl_i64:
+;EG: SUB_INT {{\*? *}}[[COMPSH:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHIFT:T[0-9]+\.[XYZW]]]
+;EG: LSHR {{\* *}}[[TEMP:T[0-9]+\.[XYZW]]], [[OPLO:T[0-9]+\.[XYZW]]], {{[[COMPSH]]|PV.[XYZW]}}
+;EG: LSHR {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1
+;EG_CHECK-DAG: ADD_INT {{\*? *}}[[BIGSH:T[0-9]+\.[XYZW]]], [[SHIFT]], literal
+;EG-DAG: LSHL {{\*? *}}[[HISMTMP:T[0-9]+\.[XYZW]]], [[OPHI:T[0-9]+\.[XYZW]]], [[SHIFT]]
+;EG-DAG: OR_INT {{\*? *}}[[HISM:T[0-9]+\.[XYZW]]], {{[[HISMTMP]]|PV.[XYZW]}}, {{[[OVERF]]|PV.[XYZW]}}
+;EG-DAG: LSHL {{\*? *}}[[LOSM:T[0-9]+\.[XYZW]]], [[OPLO]], {{PS|[[SHIFT]]}}
+;EG-DAG: SETGT_UINT {{\*? *}}[[RESC:T[0-9]+\.[XYZW]]], [[SHIFT]], literal
+;EG-DAG: CNDE_INT {{\*? *}}[[RESLO:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}}
+;EG-DAG: CNDE_INT {{\*? *}}[[RESHI:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW], .*}}, 0.0
+
+;SI: {{^}}shl_i64:
+;SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+
+;VI: {{^}}shl_i64:
+;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
+
+define void @shl_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+ %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1
+ %a = load i64, i64 addrspace(1) * %in
+ %b = load i64, i64 addrspace(1) * %b_ptr
+ %result = shl i64 %a, %b
+ store i64 %result, i64 addrspace(1)* %out
+ ret void
+}
+
+;EG: {{^}}shl_v2i64:
+;EG-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]]
+;EG-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]]
+;EG-DAG: LSHR {{\*? *}}[[COMPSHA]]
+;EG-DAG: LSHR {{\*? *}}[[COMPSHB]]
+;EG-DAG: LSHR {{.*}}, 1
+;EG-DAG: LSHR {{.*}}, 1
+;EG-DAG: ADD_INT {{\*? *}}[[BIGSHA:T[0-9]+\.[XYZW]]]{{.*}}, literal
+;EG-DAG: ADD_INT {{\*? *}}[[BIGSHB:T[0-9]+\.[XYZW]]]{{.*}}, literal
+;EG-DAG: LSHL {{.*}}, [[SHA]]
+;EG-DAG: LSHL {{.*}}, [[SHB]]
+;EG-DAG: LSHL {{.*}}, [[SHA]]
+;EG-DAG: LSHL {{.*}}, [[SHB]]
+;EG-DAG: LSHL
+;EG-DAG: LSHL
+;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHA]], literal
+;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHB]], literal
+;EG-DAG: CNDE_INT {{.*}}, 0.0
+;EG-DAG: CNDE_INT {{.*}}, 0.0
+;EG-DAG: CNDE_INT
+;EG-DAG: CNDE_INT
+
+;SI: {{^}}shl_v2i64:
+;SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+;SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+
+;VI: {{^}}shl_v2i64:
+;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
+;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
+
+define void @shl_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
+ %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1
+ %a = load <2 x i64>, <2 x i64> addrspace(1) * %in
+ %b = load <2 x i64>, <2 x i64> addrspace(1) * %b_ptr
+ %result = shl <2 x i64> %a, %b
+ store <2 x i64> %result, <2 x i64> addrspace(1)* %out
+ ret void
+}
+
+;EG: {{^}}shl_v4i64:
+;EG-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]]
+;EG-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]]
+;EG-DAG: SUB_INT {{\*? *}}[[COMPSHC:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHC:T[0-9]+\.[XYZW]]]
+;EG-DAG: SUB_INT {{\*? *}}[[COMPSHD:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHD:T[0-9]+\.[XYZW]]]
+;EG-DAG: LSHR {{\*? *}}[[COMPSHA]]
+;EG-DAG: LSHR {{\*? *}}[[COMPSHB]]
+;EG-DAG: LSHR {{\*? *}}[[COMPSHC]]
+;EG-DAG: LSHR {{\*? *}}[[COMPSHD]]
+;EG-DAG: LSHR {{.*}}, 1
+;EG-DAG: LSHR {{.*}}, 1
+;EG-DAG: LSHR {{.*}}, 1
+;EG-DAG: LSHR {{.*}}, 1
+;EG-DAG: ADD_INT {{\*? *}}[[BIGSHA:T[0-9]+\.[XYZW]]]{{.*}}, literal
+;EG-DAG: ADD_INT {{\*? *}}[[BIGSHB:T[0-9]+\.[XYZW]]]{{.*}}, literal
+;EG-DAG: ADD_INT {{\*? *}}[[BIGSHC:T[0-9]+\.[XYZW]]]{{.*}}, literal
+;EG-DAG: ADD_INT {{\*? *}}[[BIGSHD:T[0-9]+\.[XYZW]]]{{.*}}, literal
+;EG-DAG: LSHL {{.*}}, [[SHA]]
+;EG-DAG: LSHL {{.*}}, [[SHB]]
+;EG-DAG: LSHL {{.*}}, [[SHC]]
+;EG-DAG: LSHL {{.*}}, [[SHD]]
+;EG-DAG: LSHL {{.*}}, [[SHA]]
+;EG-DAG: LSHL {{.*}}, [[SHB]]
+;EG-DAG: LSHL {{.*}}, [[SHC]]
+;EG-DAG: LSHL {{.*}}, [[SHD]]
+;EG-DAG: LSHL
+;EG-DAG: LSHL
+;EG-DAG: LSHL
+;EG-DAG: LSHL
+;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHA]], literal
+;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHB]], literal
+;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHC]], literal
+;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHD]], literal
+;EG-DAG: CNDE_INT {{.*}}, 0.0
+;EG-DAG: CNDE_INT {{.*}}, 0.0
+;EG-DAG: CNDE_INT {{.*}}, 0.0
+;EG-DAG: CNDE_INT {{.*}}, 0.0
+;EG-DAG: CNDE_INT
+;EG-DAG: CNDE_INT
+;EG-DAG: CNDE_INT
+;EG-DAG: CNDE_INT
+
+;SI: {{^}}shl_v4i64:
+;SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+;SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+;SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+;SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+
+;VI: {{^}}shl_v4i64:
+;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
+;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
+;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
+;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
+
+define void @shl_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
+ %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1
+ %a = load <4 x i64>, <4 x i64> addrspace(1) * %in
+ %b = load <4 x i64>, <4 x i64> addrspace(1) * %b_ptr
+ %result = shl <4 x i64> %a, %b
+ store <4 x i64> %result, <4 x i64> addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/shl_add_constant.ll b/test/CodeGen/AMDGPU/shl_add_constant.ll
new file mode 100644
index 000000000000..b1485bfaaebb
--- /dev/null
+++ b/test/CodeGen/AMDGPU/shl_add_constant.ll
@@ -0,0 +1,90 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+declare i32 @llvm.r600.read.tidig.x() #1
+
+; Test with inline immediate
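+; ((x + 9) << 2 folds to (x << 2) + 36, and 36 is small enough to be encoded
+; as an inline immediate.)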
+
+; FUNC-LABEL: {{^}}shl_2_add_9_i32:
+; SI: v_lshlrev_b32_e32 [[REG:v[0-9]+]], 2, {{v[0-9]+}}
+; SI: v_add_i32_e32 [[RESULT:v[0-9]+]], 36, [[REG]]
+; SI: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @shl_2_add_9_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+ %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 %tid.x
+ %val = load i32, i32 addrspace(1)* %ptr, align 4
+ %add = add i32 %val, 9
+ %result = shl i32 %add, 2
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}shl_2_add_9_i32_2_add_uses:
+; SI-DAG: v_add_i32_e32 [[ADDREG:v[0-9]+]], 9, {{v[0-9]+}}
+; SI-DAG: v_lshlrev_b32_e32 [[SHLREG:v[0-9]+]], 2, {{v[0-9]+}}
+; SI-DAG: buffer_store_dword [[ADDREG]]
+; SI-DAG: buffer_store_dword [[SHLREG]]
+; SI: s_endpgm
+define void @shl_2_add_9_i32_2_add_uses(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %in) #0 {
+ %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+ %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 %tid.x
+ %val = load i32, i32 addrspace(1)* %ptr, align 4
+ %add = add i32 %val, 9
+ %result = shl i32 %add, 2
+ store i32 %result, i32 addrspace(1)* %out0, align 4
+ store i32 %add, i32 addrspace(1)* %out1, align 4
+ ret void
+}
+
+; Test with add literal constant
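+; (999 << 2 = 3996 = 0xf9c, which is too large for an inline immediate and so
+; is emitted as a literal constant.)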
+
+; FUNC-LABEL: {{^}}shl_2_add_999_i32:
+; SI: v_lshlrev_b32_e32 [[REG:v[0-9]+]], 2, {{v[0-9]+}}
+; SI: v_add_i32_e32 [[RESULT:v[0-9]+]], 0xf9c, [[REG]]
+; SI: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @shl_2_add_999_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+ %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+ %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 %tid.x
+ %val = load i32, i32 addrspace(1)* %ptr, align 4
+ %shl = add i32 %val, 999
+ %result = shl i32 %shl, 2
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_add_shl_add_constant:
+; SI-DAG: s_load_dword [[X:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dword [[Y:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
+; SI: s_lshl_b32 [[SHL3:s[0-9]+]], [[X]], 3
+; SI: s_add_i32 [[TMP:s[0-9]+]], [[SHL3]], [[Y]]
+; SI: s_add_i32 [[RESULT:s[0-9]+]], [[TMP]], 0x3d8
+; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[RESULT]]
+; SI: buffer_store_dword [[VRESULT]]
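+; (0x3d8 is 984, i.e. the add of 123 folded through the shift by 3.)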
+define void @test_add_shl_add_constant(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 {
+ %add.0 = add i32 %x, 123
+ %shl = shl i32 %add.0, 3
+ %add.1 = add i32 %shl, %y
+ store i32 %add.1, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_add_shl_add_constant_inv:
+; SI-DAG: s_load_dword [[X:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dword [[Y:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
+; SI: s_lshl_b32 [[SHL3:s[0-9]+]], [[X]], 3
+; SI: s_add_i32 [[TMP:s[0-9]+]], [[SHL3]], [[Y]]
+; SI: s_add_i32 [[RESULT:s[0-9]+]], [[TMP]], 0x3d8
+; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[RESULT]]
+; SI: buffer_store_dword [[VRESULT]]
+
+define void @test_add_shl_add_constant_inv(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 {
+ %add.0 = add i32 %x, 123
+ %shl = shl i32 %add.0, 3
+ %add.1 = add i32 %y, %shl
+ store i32 %add.1, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/shl_add_ptr.ll b/test/CodeGen/AMDGPU/shl_add_ptr.ll
new file mode 100644
index 000000000000..6671e909cd1d
--- /dev/null
+++ b/test/CodeGen/AMDGPU/shl_add_ptr.ll
@@ -0,0 +1,284 @@
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=SI %s
+
+; Test that doing a shift of a pointer with a constant add will be
+; folded into the constant offset addressing mode even if the add has
+; multiple uses. This is relevant to accessing 2 separate, adjacent
+; LDS globals.
+
+
+declare i32 @llvm.r600.read.tidig.x() #1
+
+@lds0 = addrspace(3) global [512 x float] undef, align 4
+@lds1 = addrspace(3) global [512 x float] undef, align 4
+
+
+; Make sure the (add tid, 2) << 2 gets folded into the ds's offset as (tid << 2) + 8
+
+; SI-LABEL: {{^}}load_shl_base_lds_0:
+; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; SI: ds_read_b32 {{v[0-9]+}}, [[PTR]] offset:8
+; SI: s_endpgm
+define void @load_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+ %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+ %idx.0 = add nsw i32 %tid.x, 2
+ %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0
+ %val0 = load float, float addrspace(3)* %arrayidx0, align 4
+ store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
+ store float %val0, float addrspace(1)* %out
+ ret void
+}
+
+; Make sure once the first use is folded into the addressing mode, the
+; remaining add use goes through the normal shl + add constant fold.
+
+; SI-LABEL: {{^}}load_shl_base_lds_1:
+; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; SI: ds_read_b32 [[RESULT:v[0-9]+]], [[PTR]] offset:8
+; SI: v_add_i32_e32 [[ADDUSE:v[0-9]+]], 8, v{{[0-9]+}}
+; SI-DAG: buffer_store_dword [[RESULT]]
+; SI-DAG: buffer_store_dword [[ADDUSE]]
+; SI: s_endpgm
+define void @load_shl_base_lds_1(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+ %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+ %idx.0 = add nsw i32 %tid.x, 2
+ %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0
+ %val0 = load float, float addrspace(3)* %arrayidx0, align 4
+ %shl_add_use = shl i32 %idx.0, 2
+ store i32 %shl_add_use, i32 addrspace(1)* %add_use, align 4
+ store float %val0, float addrspace(1)* %out
+ ret void
+}
+
+@maxlds = addrspace(3) global [65536 x i8] undef, align 4
+
+; SI-LABEL: {{^}}load_shl_base_lds_max_offset
+; SI: ds_read_u8 v{{[0-9]+}}, v{{[0-9]+}} offset:65535
+; SI: s_endpgm
+define void @load_shl_base_lds_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %lds, i32 addrspace(1)* %add_use) #0 {
+ %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+ %idx.0 = add nsw i32 %tid.x, 65535
+ %arrayidx0 = getelementptr inbounds [65536 x i8], [65536 x i8] addrspace(3)* @maxlds, i32 0, i32 %idx.0
+ %val0 = load i8, i8 addrspace(3)* %arrayidx0
+ store i32 %idx.0, i32 addrspace(1)* %add_use
+ store i8 %val0, i8 addrspace(1)* %out
+ ret void
+}
+
+; The two globals are placed adjacent in memory, so the same base
+; pointer can be used with an offset into the second one.
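+; (lds0 occupies 2048 bytes, so element tid+64 of lds0 is 256 bytes past the
+; shifted pointer and the matching element of lds1 is 2048 + 256 = 2304 bytes
+; past it; in 64-dword units those are offset0:1 and offset1:9.)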
+
+; SI-LABEL: {{^}}load_shl_base_lds_2:
+; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; SI: s_mov_b32 m0, -1
+; SI-NEXT: ds_read2st64_b32 {{v\[[0-9]+:[0-9]+\]}}, [[PTR]] offset0:1 offset1:9
+; SI: s_endpgm
+define void @load_shl_base_lds_2(float addrspace(1)* %out) #0 {
+ %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+ %idx.0 = add nsw i32 %tid.x, 64
+ %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0
+ %val0 = load float, float addrspace(3)* %arrayidx0, align 4
+ %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds1, i32 0, i32 %idx.0
+ %val1 = load float, float addrspace(3)* %arrayidx1, align 4
+ %sum = fadd float %val0, %val1
+ store float %sum, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}store_shl_base_lds_0:
+; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; SI: ds_write_b32 [[PTR]], {{v[0-9]+}} offset:8
+; SI: s_endpgm
+define void @store_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+ %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+ %idx.0 = add nsw i32 %tid.x, 2
+ %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0
+ store float 1.0, float addrspace(3)* %arrayidx0, align 4
+ store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
+ ret void
+}
+
+
+; --------------------------------------------------------------------------------
+; Atomics.
+
+@lds2 = addrspace(3) global [512 x i32] undef, align 4
+
+; define void @atomic_load_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+; %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+; %idx.0 = add nsw i32 %tid.x, 2
+; %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
+; %val = load atomic i32, i32 addrspace(3)* %arrayidx0 seq_cst, align 4
+; store i32 %val, i32 addrspace(1)* %out, align 4
+; store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
+; ret void
+; }
+
+
+; SI-LABEL: {{^}}atomic_cmpxchg_shl_base_lds_0:
+; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; SI: ds_cmpst_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}}, {{v[0-9]+}} offset:8
+; SI: s_endpgm
+define void @atomic_cmpxchg_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use, i32 %swap) #0 {
+ %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+ %idx.0 = add nsw i32 %tid.x, 2
+ %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
+ %pair = cmpxchg i32 addrspace(3)* %arrayidx0, i32 7, i32 %swap seq_cst monotonic
+ %result = extractvalue { i32, i1 } %pair, 0
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}atomic_swap_shl_base_lds_0:
+; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; SI: ds_wrxchg_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
+; SI: s_endpgm
+define void @atomic_swap_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+ %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+ %idx.0 = add nsw i32 %tid.x, 2
+ %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
+ %val = atomicrmw xchg i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
+ store i32 %val, i32 addrspace(1)* %out, align 4
+ store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}atomic_add_shl_base_lds_0:
+; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; SI: ds_add_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
+; SI: s_endpgm
+define void @atomic_add_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+ %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+ %idx.0 = add nsw i32 %tid.x, 2
+ %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
+ %val = atomicrmw add i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
+ store i32 %val, i32 addrspace(1)* %out, align 4
+ store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}atomic_sub_shl_base_lds_0:
+; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; SI: ds_sub_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
+; SI: s_endpgm
+define void @atomic_sub_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+ %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+ %idx.0 = add nsw i32 %tid.x, 2
+ %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
+ %val = atomicrmw sub i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
+ store i32 %val, i32 addrspace(1)* %out, align 4
+ store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}atomic_and_shl_base_lds_0:
+; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; SI: ds_and_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
+; SI: s_endpgm
+define void @atomic_and_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+ %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+ %idx.0 = add nsw i32 %tid.x, 2
+ %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
+ %val = atomicrmw and i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
+ store i32 %val, i32 addrspace(1)* %out, align 4
+ store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}atomic_or_shl_base_lds_0:
+; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; SI: ds_or_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
+; SI: s_endpgm
+define void @atomic_or_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+ %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+ %idx.0 = add nsw i32 %tid.x, 2
+ %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
+ %val = atomicrmw or i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
+ store i32 %val, i32 addrspace(1)* %out, align 4
+ store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}atomic_xor_shl_base_lds_0:
+; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; SI: ds_xor_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
+; SI: s_endpgm
+define void @atomic_xor_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+ %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+ %idx.0 = add nsw i32 %tid.x, 2
+ %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
+ %val = atomicrmw xor i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
+ store i32 %val, i32 addrspace(1)* %out, align 4
+ store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
+ ret void
+}
+
+; define void @atomic_nand_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+; %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+; %idx.0 = add nsw i32 %tid.x, 2
+; %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
+; %val = atomicrmw nand i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
+; store i32 %val, i32 addrspace(1)* %out, align 4
+; store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
+; ret void
+; }
+
+; SI-LABEL: {{^}}atomic_min_shl_base_lds_0:
+; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; SI: ds_min_rtn_i32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
+; SI: s_endpgm
+define void @atomic_min_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+ %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+ %idx.0 = add nsw i32 %tid.x, 2
+ %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
+ %val = atomicrmw min i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
+ store i32 %val, i32 addrspace(1)* %out, align 4
+ store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}atomic_max_shl_base_lds_0:
+; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; SI: ds_max_rtn_i32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
+; SI: s_endpgm
+define void @atomic_max_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+ %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+ %idx.0 = add nsw i32 %tid.x, 2
+ %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
+ %val = atomicrmw max i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
+ store i32 %val, i32 addrspace(1)* %out, align 4
+ store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}atomic_umin_shl_base_lds_0:
+; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; SI: ds_min_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
+; SI: s_endpgm
+define void @atomic_umin_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+ %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+ %idx.0 = add nsw i32 %tid.x, 2
+ %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
+ %val = atomicrmw umin i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
+ store i32 %val, i32 addrspace(1)* %out, align 4
+ store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}atomic_umax_shl_base_lds_0:
+; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; SI: ds_max_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
+; SI: s_endpgm
+define void @atomic_umax_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+ %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+ %idx.0 = add nsw i32 %tid.x, 2
+ %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
+ %val = atomicrmw umax i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
+ store i32 %val, i32 addrspace(1)* %out, align 4
+ store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
+ ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/si-annotate-cf-assertion.ll b/test/CodeGen/AMDGPU/si-annotate-cf-assertion.ll
new file mode 100644
index 000000000000..69d719385acd
--- /dev/null
+++ b/test/CodeGen/AMDGPU/si-annotate-cf-assertion.ll
@@ -0,0 +1,25 @@
+; REQUIRES: asserts
+; XFAIL: *
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -asm-verbose=false < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -asm-verbose=false < %s | FileCheck %s
+
+
+define void @test(i32 addrspace(1)* %g, i8 addrspace(3)* %l, i32 %x) nounwind {
+; CHECK-LABEL: {{^}}test:
+
+entry:
+ switch i32 %x, label %sw.default [
+ i32 0, label %sw.bb
+ i32 60, label %sw.bb
+ ]
+
+sw.bb:
+ unreachable
+
+sw.default:
+ unreachable
+
+sw.epilog:
+ ret void
+}
+
diff --git a/test/CodeGen/AMDGPU/si-annotate-cf.ll b/test/CodeGen/AMDGPU/si-annotate-cf.ll
new file mode 100644
index 000000000000..bbcb861f37dc
--- /dev/null
+++ b/test/CodeGen/AMDGPU/si-annotate-cf.ll
@@ -0,0 +1,63 @@
+; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}break_inserted_outside_of_loop:
+
+; SI: [[LOOP_LABEL:[A-Z0-9]+]]:
+; Lowered break instruction:
+; SI: s_or_b64
+; Lowered Loop instruction:
+; SI: s_andn2_b64
+; SI: s_cbranch_execnz [[LOOP_LABEL]]
+; SI: s_endpgm
+define void @break_inserted_outside_of_loop(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+main_body:
+ %0 = and i32 %a, %b
+ %1 = trunc i32 %0 to i1
+ br label %ENDIF
+
+ENDLOOP:
+ store i32 0, i32 addrspace(1)* %out
+ ret void
+
+ENDIF:
+ br i1 %1, label %ENDLOOP, label %ENDIF
+}
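+
+; Illustrative sketch only (not checked by FileCheck): under the usual SI
+; control-flow lowering, the break/loop pair in the test above is assumed to
+; expand to an exec-mask loop roughly of the form
+;   [[LOOP]]:
+;     s_or_b64    s[mask], vcc, s[mask]    ; accumulate lanes that took the break
+;     s_andn2_b64 exec, exec, s[mask]      ; remove those lanes from exec
+;     s_cbranch_execnz [[LOOP]]            ; keep looping while any lane is active
+;     s_endpgm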
+
+
+; FUNC-LABEL: {{^}}phi_cond_outside_loop:
+; FIXME: This could be folded into the s_or_b64 instruction
+; SI: s_mov_b64 [[ZERO:s\[[0-9]+:[0-9]+\]]], 0
+; SI: [[LOOP_LABEL:[A-Z0-9]+]]
+; SI: v_cmp_ne_i32_e32 vcc, 0, v{{[0-9]+}}
+
+; SI_IF_BREAK instruction:
+; SI: s_or_b64 [[BREAK:s\[[0-9]+:[0-9]+\]]], vcc, [[ZERO]]
+
+; SI_LOOP instruction:
+; SI: s_andn2_b64 exec, exec, [[BREAK]]
+; SI: s_cbranch_execnz [[LOOP_LABEL]]
+; SI: s_endpgm
+
+define void @phi_cond_outside_loop(i32 %a, i32 %b) {
+entry:
+ %0 = icmp eq i32 %a , 0
+ br i1 %0, label %if, label %else
+
+if:
+ br label %endif
+
+else:
+ %1 = icmp eq i32 %b, 0
+ br label %endif
+
+endif:
+ %2 = phi i1 [0, %if], [%1, %else]
+ br label %loop
+
+loop:
+ br i1 %2, label %exit, label %loop
+
+exit:
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/si-lod-bias.ll b/test/CodeGen/AMDGPU/si-lod-bias.ll
new file mode 100644
index 000000000000..944499a11461
--- /dev/null
+++ b/test/CodeGen/AMDGPU/si-lod-bias.ll
@@ -0,0 +1,52 @@
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
+
+; This shader has the potential to generate illegal VGPR to SGPR copies if
+; the wrong register class is used for the REG_SEQUENCE instructions.
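+;
+; Illustrative sketch only (hypothetical MIR, not part of this test): such an
+; illegal copy would look roughly like
+;   %sgpr0 = COPY %vgpr0
+; SGPRs cannot be written from VGPRs with a plain copy on SI, so copies of this
+; shape have to be rewritten (e.g. through v_readfirstlane_b32) or avoided by
+; assigning the REG_SEQUENCE result a VGPR register class in the first place.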
+
+; CHECK: {{^}}main:
+; CHECK: image_sample_b v{{\[[0-9]:[0-9]\]}}, 15, 0, 0, 0, 0, 0, 0, 0, v{{\[[0-9]:[0-9]\]}}
+
+define void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
+main_body:
+ %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0
+ %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20, !tbaa !1
+ %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 16)
+ %23 = getelementptr <32 x i8>, <32 x i8> addrspace(2)* %2, i32 0
+ %24 = load <32 x i8>, <32 x i8> addrspace(2)* %23, !tbaa !1
+ %25 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %1, i32 0
+ %26 = load <16 x i8>, <16 x i8> addrspace(2)* %25, !tbaa !1
+ %27 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %3, <2 x i32> %5)
+ %28 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %3, <2 x i32> %5)
+ %29 = bitcast float %22 to i32
+ %30 = bitcast float %27 to i32
+ %31 = bitcast float %28 to i32
+ %32 = insertelement <4 x i32> undef, i32 %29, i32 0
+ %33 = insertelement <4 x i32> %32, i32 %30, i32 1
+ %34 = insertelement <4 x i32> %33, i32 %31, i32 2
+ %35 = insertelement <4 x i32> %34, i32 undef, i32 3
+ %36 = call <4 x float> @llvm.SI.sampleb.v4i32(<4 x i32> %35, <32 x i8> %24, <16 x i8> %26, i32 2)
+ %37 = extractelement <4 x float> %36, i32 0
+ %38 = extractelement <4 x float> %36, i32 1
+ %39 = extractelement <4 x float> %36, i32 2
+ %40 = extractelement <4 x float> %36, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %37, float %38, float %39, float %40)
+ ret void
+}
+
+; Function Attrs: nounwind readnone
+declare float @llvm.SI.load.const(<16 x i8>, i32) #1
+
+; Function Attrs: nounwind readnone
+declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1
+
+; Function Attrs: nounwind readnone
+declare <4 x float> @llvm.SI.sampleb.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32) #1
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+
+attributes #0 = { "ShaderType"="0" }
+attributes #1 = { nounwind readnone }
+
+!0 = !{!"const", null}
+!1 = !{!0, !0, i64 0, i32 1}
diff --git a/test/CodeGen/AMDGPU/si-sgpr-spill.ll b/test/CodeGen/AMDGPU/si-sgpr-spill.ll
new file mode 100644
index 000000000000..84652701f773
--- /dev/null
+++ b/test/CodeGen/AMDGPU/si-sgpr-spill.ll
@@ -0,0 +1,1568 @@
+; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck %s
+
+; These tests check that the compiler won't crash when it needs to spill
+; SGPRs.
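+;
+; Illustrative sketch only (not checked here): when the SGPR budget is
+; exceeded, the SI backend is assumed to spill SGPRs into lanes of a VGPR and
+; restore them later, roughly:
+;   v_writelane_b32 v[spill], s[n], <lane>   ; spill s[n] into one VGPR lane
+;   ...
+;   v_readlane_b32  s[n], v[spill], <lane>   ; reload it where it is needed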
+
+; CHECK-LABEL: {{^}}main:
+; CHECK: s_wqm
+; Writing to M0 from an SMRD instruction will hang the GPU.
+; CHECK-NOT: s_buffer_load_dword m0
+; CHECK: s_endpgm
+@ddxy_lds = external addrspace(3) global [64 x i32]
+
+define void @main([17 x <16 x i8>] addrspace(2)* byval, [32 x <16 x i8>] addrspace(2)* byval, [16 x <32 x i8>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
+main_body:
+ %21 = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %0, i64 0, i32 0
+ %22 = load <16 x i8>, <16 x i8> addrspace(2)* %21, !tbaa !0
+ %23 = call float @llvm.SI.load.const(<16 x i8> %22, i32 96)
+ %24 = call float @llvm.SI.load.const(<16 x i8> %22, i32 100)
+ %25 = call float @llvm.SI.load.const(<16 x i8> %22, i32 104)
+ %26 = call float @llvm.SI.load.const(<16 x i8> %22, i32 112)
+ %27 = call float @llvm.SI.load.const(<16 x i8> %22, i32 116)
+ %28 = call float @llvm.SI.load.const(<16 x i8> %22, i32 120)
+ %29 = call float @llvm.SI.load.const(<16 x i8> %22, i32 128)
+ %30 = call float @llvm.SI.load.const(<16 x i8> %22, i32 132)
+ %31 = call float @llvm.SI.load.const(<16 x i8> %22, i32 140)
+ %32 = call float @llvm.SI.load.const(<16 x i8> %22, i32 144)
+ %33 = call float @llvm.SI.load.const(<16 x i8> %22, i32 160)
+ %34 = call float @llvm.SI.load.const(<16 x i8> %22, i32 176)
+ %35 = call float @llvm.SI.load.const(<16 x i8> %22, i32 180)
+ %36 = call float @llvm.SI.load.const(<16 x i8> %22, i32 184)
+ %37 = call float @llvm.SI.load.const(<16 x i8> %22, i32 192)
+ %38 = call float @llvm.SI.load.const(<16 x i8> %22, i32 196)
+ %39 = call float @llvm.SI.load.const(<16 x i8> %22, i32 200)
+ %40 = call float @llvm.SI.load.const(<16 x i8> %22, i32 208)
+ %41 = call float @llvm.SI.load.const(<16 x i8> %22, i32 212)
+ %42 = call float @llvm.SI.load.const(<16 x i8> %22, i32 216)
+ %43 = call float @llvm.SI.load.const(<16 x i8> %22, i32 224)
+ %44 = call float @llvm.SI.load.const(<16 x i8> %22, i32 240)
+ %45 = call float @llvm.SI.load.const(<16 x i8> %22, i32 244)
+ %46 = call float @llvm.SI.load.const(<16 x i8> %22, i32 248)
+ %47 = call float @llvm.SI.load.const(<16 x i8> %22, i32 256)
+ %48 = call float @llvm.SI.load.const(<16 x i8> %22, i32 272)
+ %49 = call float @llvm.SI.load.const(<16 x i8> %22, i32 276)
+ %50 = call float @llvm.SI.load.const(<16 x i8> %22, i32 280)
+ %51 = call float @llvm.SI.load.const(<16 x i8> %22, i32 288)
+ %52 = call float @llvm.SI.load.const(<16 x i8> %22, i32 292)
+ %53 = call float @llvm.SI.load.const(<16 x i8> %22, i32 296)
+ %54 = call float @llvm.SI.load.const(<16 x i8> %22, i32 304)
+ %55 = call float @llvm.SI.load.const(<16 x i8> %22, i32 308)
+ %56 = call float @llvm.SI.load.const(<16 x i8> %22, i32 312)
+ %57 = call float @llvm.SI.load.const(<16 x i8> %22, i32 368)
+ %58 = call float @llvm.SI.load.const(<16 x i8> %22, i32 372)
+ %59 = call float @llvm.SI.load.const(<16 x i8> %22, i32 376)
+ %60 = call float @llvm.SI.load.const(<16 x i8> %22, i32 384)
+ %61 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 0
+ %62 = load <32 x i8>, <32 x i8> addrspace(2)* %61, !tbaa !0
+ %63 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 0
+ %64 = load <16 x i8>, <16 x i8> addrspace(2)* %63, !tbaa !0
+ %65 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 1
+ %66 = load <32 x i8>, <32 x i8> addrspace(2)* %65, !tbaa !0
+ %67 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 1
+ %68 = load <16 x i8>, <16 x i8> addrspace(2)* %67, !tbaa !0
+ %69 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 2
+ %70 = load <32 x i8>, <32 x i8> addrspace(2)* %69, !tbaa !0
+ %71 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 2
+ %72 = load <16 x i8>, <16 x i8> addrspace(2)* %71, !tbaa !0
+ %73 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 3
+ %74 = load <32 x i8>, <32 x i8> addrspace(2)* %73, !tbaa !0
+ %75 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 3
+ %76 = load <16 x i8>, <16 x i8> addrspace(2)* %75, !tbaa !0
+ %77 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 4
+ %78 = load <32 x i8>, <32 x i8> addrspace(2)* %77, !tbaa !0
+ %79 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 4
+ %80 = load <16 x i8>, <16 x i8> addrspace(2)* %79, !tbaa !0
+ %81 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 5
+ %82 = load <32 x i8>, <32 x i8> addrspace(2)* %81, !tbaa !0
+ %83 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 5
+ %84 = load <16 x i8>, <16 x i8> addrspace(2)* %83, !tbaa !0
+ %85 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 6
+ %86 = load <32 x i8>, <32 x i8> addrspace(2)* %85, !tbaa !0
+ %87 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 6
+ %88 = load <16 x i8>, <16 x i8> addrspace(2)* %87, !tbaa !0
+ %89 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 7
+ %90 = load <32 x i8>, <32 x i8> addrspace(2)* %89, !tbaa !0
+ %91 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 7
+ %92 = load <16 x i8>, <16 x i8> addrspace(2)* %91, !tbaa !0
+ %93 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %4, <2 x i32> %6)
+ %94 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %4, <2 x i32> %6)
+ %95 = call float @llvm.SI.fs.interp(i32 0, i32 1, i32 %4, <2 x i32> %6)
+ %96 = call float @llvm.SI.fs.interp(i32 1, i32 1, i32 %4, <2 x i32> %6)
+ %97 = call float @llvm.SI.fs.interp(i32 2, i32 1, i32 %4, <2 x i32> %6)
+ %98 = call float @llvm.SI.fs.interp(i32 0, i32 2, i32 %4, <2 x i32> %6)
+ %99 = call float @llvm.SI.fs.interp(i32 1, i32 2, i32 %4, <2 x i32> %6)
+ %100 = call float @llvm.SI.fs.interp(i32 2, i32 2, i32 %4, <2 x i32> %6)
+ %101 = call float @llvm.SI.fs.interp(i32 0, i32 3, i32 %4, <2 x i32> %6)
+ %102 = call float @llvm.SI.fs.interp(i32 1, i32 3, i32 %4, <2 x i32> %6)
+ %103 = call float @llvm.SI.fs.interp(i32 2, i32 3, i32 %4, <2 x i32> %6)
+ %104 = call float @llvm.SI.fs.interp(i32 0, i32 4, i32 %4, <2 x i32> %6)
+ %105 = call float @llvm.SI.fs.interp(i32 1, i32 4, i32 %4, <2 x i32> %6)
+ %106 = call float @llvm.SI.fs.interp(i32 2, i32 4, i32 %4, <2 x i32> %6)
+ %107 = call float @llvm.SI.fs.interp(i32 0, i32 5, i32 %4, <2 x i32> %6)
+ %108 = call float @llvm.SI.fs.interp(i32 1, i32 5, i32 %4, <2 x i32> %6)
+ %109 = call float @llvm.SI.fs.interp(i32 2, i32 5, i32 %4, <2 x i32> %6)
+ %110 = call i32 @llvm.SI.tid()
+ %111 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %110
+ %112 = bitcast float %93 to i32
+ store i32 %112, i32 addrspace(3)* %111
+ %113 = bitcast float %94 to i32
+ store i32 %113, i32 addrspace(3)* %111
+ %114 = call i32 @llvm.SI.tid()
+ %115 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %114
+ %116 = and i32 %114, -4
+ %117 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %116
+ %118 = add i32 %116, 1
+ %119 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %118
+ %120 = bitcast float %93 to i32
+ store i32 %120, i32 addrspace(3)* %115
+ %121 = load i32, i32 addrspace(3)* %117
+ %122 = bitcast i32 %121 to float
+ %123 = load i32, i32 addrspace(3)* %119
+ %124 = bitcast i32 %123 to float
+ %125 = fsub float %124, %122
+ %126 = bitcast float %94 to i32
+ store i32 %126, i32 addrspace(3)* %115
+ %127 = load i32, i32 addrspace(3)* %117
+ %128 = bitcast i32 %127 to float
+ %129 = load i32, i32 addrspace(3)* %119
+ %130 = bitcast i32 %129 to float
+ %131 = fsub float %130, %128
+ %132 = insertelement <4 x float> undef, float %125, i32 0
+ %133 = insertelement <4 x float> %132, float %131, i32 1
+ %134 = insertelement <4 x float> %133, float %131, i32 2
+ %135 = insertelement <4 x float> %134, float %131, i32 3
+ %136 = extractelement <4 x float> %135, i32 0
+ %137 = extractelement <4 x float> %135, i32 1
+ %138 = fmul float %60, %93
+ %139 = fmul float %60, %94
+ %140 = fmul float %60, %94
+ %141 = fmul float %60, %94
+ %142 = call i32 @llvm.SI.tid()
+ %143 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %142
+ %144 = bitcast float %138 to i32
+ store i32 %144, i32 addrspace(3)* %143
+ %145 = bitcast float %139 to i32
+ store i32 %145, i32 addrspace(3)* %143
+ %146 = bitcast float %140 to i32
+ store i32 %146, i32 addrspace(3)* %143
+ %147 = bitcast float %141 to i32
+ store i32 %147, i32 addrspace(3)* %143
+ %148 = call i32 @llvm.SI.tid()
+ %149 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %148
+ %150 = and i32 %148, -4
+ %151 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %150
+ %152 = add i32 %150, 2
+ %153 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %152
+ %154 = bitcast float %138 to i32
+ store i32 %154, i32 addrspace(3)* %149
+ %155 = load i32, i32 addrspace(3)* %151
+ %156 = bitcast i32 %155 to float
+ %157 = load i32, i32 addrspace(3)* %153
+ %158 = bitcast i32 %157 to float
+ %159 = fsub float %158, %156
+ %160 = bitcast float %139 to i32
+ store i32 %160, i32 addrspace(3)* %149
+ %161 = load i32, i32 addrspace(3)* %151
+ %162 = bitcast i32 %161 to float
+ %163 = load i32, i32 addrspace(3)* %153
+ %164 = bitcast i32 %163 to float
+ %165 = fsub float %164, %162
+ %166 = bitcast float %140 to i32
+ store i32 %166, i32 addrspace(3)* %149
+ %167 = load i32, i32 addrspace(3)* %151
+ %168 = bitcast i32 %167 to float
+ %169 = load i32, i32 addrspace(3)* %153
+ %170 = bitcast i32 %169 to float
+ %171 = fsub float %170, %168
+ %172 = bitcast float %141 to i32
+ store i32 %172, i32 addrspace(3)* %149
+ %173 = load i32, i32 addrspace(3)* %151
+ %174 = bitcast i32 %173 to float
+ %175 = load i32, i32 addrspace(3)* %153
+ %176 = bitcast i32 %175 to float
+ %177 = fsub float %176, %174
+ %178 = insertelement <4 x float> undef, float %159, i32 0
+ %179 = insertelement <4 x float> %178, float %165, i32 1
+ %180 = insertelement <4 x float> %179, float %171, i32 2
+ %181 = insertelement <4 x float> %180, float %177, i32 3
+ %182 = extractelement <4 x float> %181, i32 0
+ %183 = extractelement <4 x float> %181, i32 1
+ %184 = fdiv float 1.000000e+00, %97
+ %185 = fmul float %33, %184
+ %186 = fcmp uge float 1.000000e+00, %185
+ %187 = select i1 %186, float %185, float 1.000000e+00
+ %188 = fmul float %187, %30
+ %189 = call float @ceil(float %188)
+ %190 = fcmp uge float 3.000000e+00, %189
+ %191 = select i1 %190, float 3.000000e+00, float %189
+ %192 = fdiv float 1.000000e+00, %191
+ %193 = fdiv float 1.000000e+00, %30
+ %194 = fmul float %191, %193
+ %195 = fmul float %31, %194
+ %196 = fmul float %95, %95
+ %197 = fmul float %96, %96
+ %198 = fadd float %197, %196
+ %199 = fmul float %97, %97
+ %200 = fadd float %198, %199
+ %201 = call float @llvm.AMDGPU.rsq.f32(float %200)
+ %202 = fmul float %95, %201
+ %203 = fmul float %96, %201
+ %204 = fmul float %202, %29
+ %205 = fmul float %203, %29
+ %206 = fmul float %204, -1.000000e+00
+ %207 = fmul float %205, 1.000000e+00
+ %208 = fmul float %206, %32
+ %209 = fmul float %207, %32
+ %210 = fsub float -0.000000e+00, %208
+ %211 = fadd float %93, %210
+ %212 = fsub float -0.000000e+00, %209
+ %213 = fadd float %94, %212
+ %214 = fmul float %206, %192
+ %215 = fmul float %207, %192
+ %216 = fmul float -1.000000e+00, %192
+ %217 = bitcast float %136 to i32
+ %218 = bitcast float %182 to i32
+ %219 = bitcast float %137 to i32
+ %220 = bitcast float %183 to i32
+ %221 = insertelement <8 x i32> undef, i32 %217, i32 0
+ %222 = insertelement <8 x i32> %221, i32 %218, i32 1
+ %223 = insertelement <8 x i32> %222, i32 %219, i32 2
+ %224 = insertelement <8 x i32> %223, i32 %220, i32 3
+ br label %LOOP
+
+LOOP: ; preds = %ENDIF, %main_body
+ %temp24.0 = phi float [ 1.000000e+00, %main_body ], [ %258, %ENDIF ]
+ %temp28.0 = phi float [ %211, %main_body ], [ %253, %ENDIF ]
+ %temp29.0 = phi float [ %213, %main_body ], [ %255, %ENDIF ]
+ %temp30.0 = phi float [ 1.000000e+00, %main_body ], [ %257, %ENDIF ]
+ %225 = fcmp oge float %temp24.0, %191
+ %226 = sext i1 %225 to i32
+ %227 = bitcast i32 %226 to float
+ %228 = bitcast float %227 to i32
+ %229 = icmp ne i32 %228, 0
+ br i1 %229, label %IF, label %ENDIF
+
+IF: ; preds = %LOOP
+ %230 = bitcast float %136 to i32
+ %231 = bitcast float %182 to i32
+ %232 = bitcast float %137 to i32
+ %233 = bitcast float %183 to i32
+ %234 = insertelement <8 x i32> undef, i32 %230, i32 0
+ %235 = insertelement <8 x i32> %234, i32 %231, i32 1
+ %236 = insertelement <8 x i32> %235, i32 %232, i32 2
+ %237 = insertelement <8 x i32> %236, i32 %233, i32 3
+ br label %LOOP65
+
+ENDIF: ; preds = %LOOP
+ %238 = bitcast float %temp28.0 to i32
+ %239 = bitcast float %temp29.0 to i32
+ %240 = insertelement <8 x i32> %224, i32 %238, i32 4
+ %241 = insertelement <8 x i32> %240, i32 %239, i32 5
+ %242 = insertelement <8 x i32> %241, i32 undef, i32 6
+ %243 = insertelement <8 x i32> %242, i32 undef, i32 7
+ %244 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %243, <32 x i8> %62, <16 x i8> %64, i32 2)
+ %245 = extractelement <4 x float> %244, i32 3
+ %246 = fcmp oge float %temp30.0, %245
+ %247 = sext i1 %246 to i32
+ %248 = bitcast i32 %247 to float
+ %249 = bitcast float %248 to i32
+ %250 = and i32 %249, 1065353216
+ %251 = bitcast i32 %250 to float
+ %252 = fmul float %214, %251
+ %253 = fadd float %252, %temp28.0
+ %254 = fmul float %215, %251
+ %255 = fadd float %254, %temp29.0
+ %256 = fmul float %216, %251
+ %257 = fadd float %256, %temp30.0
+ %258 = fadd float %temp24.0, 1.000000e+00
+ br label %LOOP
+
+LOOP65: ; preds = %ENDIF66, %IF
+ %temp24.1 = phi float [ 0.000000e+00, %IF ], [ %610, %ENDIF66 ]
+ %temp28.1 = phi float [ %temp28.0, %IF ], [ %605, %ENDIF66 ]
+ %temp29.1 = phi float [ %temp29.0, %IF ], [ %607, %ENDIF66 ]
+ %temp30.1 = phi float [ %temp30.0, %IF ], [ %609, %ENDIF66 ]
+ %temp32.0 = phi float [ 1.000000e+00, %IF ], [ %611, %ENDIF66 ]
+ %259 = fcmp oge float %temp24.1, %195
+ %260 = sext i1 %259 to i32
+ %261 = bitcast i32 %260 to float
+ %262 = bitcast float %261 to i32
+ %263 = icmp ne i32 %262, 0
+ br i1 %263, label %IF67, label %ENDIF66
+
+IF67: ; preds = %LOOP65
+ %264 = bitcast float %136 to i32
+ %265 = bitcast float %182 to i32
+ %266 = bitcast float %137 to i32
+ %267 = bitcast float %183 to i32
+ %268 = bitcast float %temp28.1 to i32
+ %269 = bitcast float %temp29.1 to i32
+ %270 = insertelement <8 x i32> undef, i32 %264, i32 0
+ %271 = insertelement <8 x i32> %270, i32 %265, i32 1
+ %272 = insertelement <8 x i32> %271, i32 %266, i32 2
+ %273 = insertelement <8 x i32> %272, i32 %267, i32 3
+ %274 = insertelement <8 x i32> %273, i32 %268, i32 4
+ %275 = insertelement <8 x i32> %274, i32 %269, i32 5
+ %276 = insertelement <8 x i32> %275, i32 undef, i32 6
+ %277 = insertelement <8 x i32> %276, i32 undef, i32 7
+ %278 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %277, <32 x i8> %66, <16 x i8> %68, i32 2)
+ %279 = extractelement <4 x float> %278, i32 0
+ %280 = extractelement <4 x float> %278, i32 1
+ %281 = extractelement <4 x float> %278, i32 2
+ %282 = extractelement <4 x float> %278, i32 3
+ %283 = fmul float %282, %47
+ %284 = bitcast float %136 to i32
+ %285 = bitcast float %182 to i32
+ %286 = bitcast float %137 to i32
+ %287 = bitcast float %183 to i32
+ %288 = bitcast float %temp28.1 to i32
+ %289 = bitcast float %temp29.1 to i32
+ %290 = insertelement <8 x i32> undef, i32 %284, i32 0
+ %291 = insertelement <8 x i32> %290, i32 %285, i32 1
+ %292 = insertelement <8 x i32> %291, i32 %286, i32 2
+ %293 = insertelement <8 x i32> %292, i32 %287, i32 3
+ %294 = insertelement <8 x i32> %293, i32 %288, i32 4
+ %295 = insertelement <8 x i32> %294, i32 %289, i32 5
+ %296 = insertelement <8 x i32> %295, i32 undef, i32 6
+ %297 = insertelement <8 x i32> %296, i32 undef, i32 7
+ %298 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %297, <32 x i8> %82, <16 x i8> %84, i32 2)
+ %299 = extractelement <4 x float> %298, i32 0
+ %300 = extractelement <4 x float> %298, i32 1
+ %301 = extractelement <4 x float> %298, i32 2
+ %302 = bitcast float %136 to i32
+ %303 = bitcast float %182 to i32
+ %304 = bitcast float %137 to i32
+ %305 = bitcast float %183 to i32
+ %306 = bitcast float %temp28.1 to i32
+ %307 = bitcast float %temp29.1 to i32
+ %308 = insertelement <8 x i32> undef, i32 %302, i32 0
+ %309 = insertelement <8 x i32> %308, i32 %303, i32 1
+ %310 = insertelement <8 x i32> %309, i32 %304, i32 2
+ %311 = insertelement <8 x i32> %310, i32 %305, i32 3
+ %312 = insertelement <8 x i32> %311, i32 %306, i32 4
+ %313 = insertelement <8 x i32> %312, i32 %307, i32 5
+ %314 = insertelement <8 x i32> %313, i32 undef, i32 6
+ %315 = insertelement <8 x i32> %314, i32 undef, i32 7
+ %316 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %315, <32 x i8> %78, <16 x i8> %80, i32 2)
+ %317 = extractelement <4 x float> %316, i32 0
+ %318 = extractelement <4 x float> %316, i32 1
+ %319 = extractelement <4 x float> %316, i32 2
+ %320 = fmul float %317, %23
+ %321 = fmul float %318, %24
+ %322 = fmul float %319, %25
+ %323 = fmul float %299, %26
+ %324 = fadd float %323, %320
+ %325 = fmul float %300, %27
+ %326 = fadd float %325, %321
+ %327 = fmul float %301, %28
+ %328 = fadd float %327, %322
+ %329 = fadd float %279, %324
+ %330 = fadd float %280, %326
+ %331 = fadd float %281, %328
+ %332 = bitcast float %136 to i32
+ %333 = bitcast float %182 to i32
+ %334 = bitcast float %137 to i32
+ %335 = bitcast float %183 to i32
+ %336 = bitcast float %temp28.1 to i32
+ %337 = bitcast float %temp29.1 to i32
+ %338 = insertelement <8 x i32> undef, i32 %332, i32 0
+ %339 = insertelement <8 x i32> %338, i32 %333, i32 1
+ %340 = insertelement <8 x i32> %339, i32 %334, i32 2
+ %341 = insertelement <8 x i32> %340, i32 %335, i32 3
+ %342 = insertelement <8 x i32> %341, i32 %336, i32 4
+ %343 = insertelement <8 x i32> %342, i32 %337, i32 5
+ %344 = insertelement <8 x i32> %343, i32 undef, i32 6
+ %345 = insertelement <8 x i32> %344, i32 undef, i32 7
+ %346 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %345, <32 x i8> %62, <16 x i8> %64, i32 2)
+ %347 = extractelement <4 x float> %346, i32 0
+ %348 = extractelement <4 x float> %346, i32 1
+ %349 = extractelement <4 x float> %346, i32 2
+ %350 = fadd float %347, -5.000000e-01
+ %351 = fadd float %348, -5.000000e-01
+ %352 = fadd float %349, -5.000000e-01
+ %353 = fmul float %350, %350
+ %354 = fmul float %351, %351
+ %355 = fadd float %354, %353
+ %356 = fmul float %352, %352
+ %357 = fadd float %355, %356
+ %358 = call float @llvm.AMDGPU.rsq.f32(float %357)
+ %359 = fmul float %350, %358
+ %360 = fmul float %351, %358
+ %361 = fmul float %352, %358
+ %362 = bitcast float %136 to i32
+ %363 = bitcast float %182 to i32
+ %364 = bitcast float %137 to i32
+ %365 = bitcast float %183 to i32
+ %366 = bitcast float %temp28.1 to i32
+ %367 = bitcast float %temp29.1 to i32
+ %368 = insertelement <8 x i32> undef, i32 %362, i32 0
+ %369 = insertelement <8 x i32> %368, i32 %363, i32 1
+ %370 = insertelement <8 x i32> %369, i32 %364, i32 2
+ %371 = insertelement <8 x i32> %370, i32 %365, i32 3
+ %372 = insertelement <8 x i32> %371, i32 %366, i32 4
+ %373 = insertelement <8 x i32> %372, i32 %367, i32 5
+ %374 = insertelement <8 x i32> %373, i32 undef, i32 6
+ %375 = insertelement <8 x i32> %374, i32 undef, i32 7
+ %376 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %375, <32 x i8> %70, <16 x i8> %72, i32 2)
+ %377 = extractelement <4 x float> %376, i32 0
+ %378 = extractelement <4 x float> %376, i32 1
+ %379 = extractelement <4 x float> %376, i32 2
+ %380 = extractelement <4 x float> %376, i32 3
+ %381 = fsub float -0.000000e+00, %95
+ %382 = fsub float -0.000000e+00, %96
+ %383 = fsub float -0.000000e+00, %97
+ %384 = fmul float %359, %381
+ %385 = fmul float %360, %382
+ %386 = fadd float %385, %384
+ %387 = fmul float %361, %383
+ %388 = fadd float %386, %387
+ %389 = fmul float %388, %359
+ %390 = fmul float %388, %360
+ %391 = fmul float %388, %361
+ %392 = fmul float 2.000000e+00, %389
+ %393 = fmul float 2.000000e+00, %390
+ %394 = fmul float 2.000000e+00, %391
+ %395 = fsub float -0.000000e+00, %392
+ %396 = fadd float %381, %395
+ %397 = fsub float -0.000000e+00, %393
+ %398 = fadd float %382, %397
+ %399 = fsub float -0.000000e+00, %394
+ %400 = fadd float %383, %399
+ %401 = fmul float %396, %98
+ %402 = fmul float %396, %99
+ %403 = fmul float %396, %100
+ %404 = fmul float %398, %101
+ %405 = fadd float %404, %401
+ %406 = fmul float %398, %102
+ %407 = fadd float %406, %402
+ %408 = fmul float %398, %103
+ %409 = fadd float %408, %403
+ %410 = fmul float %400, %104
+ %411 = fadd float %410, %405
+ %412 = fmul float %400, %105
+ %413 = fadd float %412, %407
+ %414 = fmul float %400, %106
+ %415 = fadd float %414, %409
+ %416 = bitcast float %136 to i32
+ %417 = bitcast float %182 to i32
+ %418 = bitcast float %137 to i32
+ %419 = bitcast float %183 to i32
+ %420 = bitcast float %temp28.1 to i32
+ %421 = bitcast float %temp29.1 to i32
+ %422 = insertelement <8 x i32> undef, i32 %416, i32 0
+ %423 = insertelement <8 x i32> %422, i32 %417, i32 1
+ %424 = insertelement <8 x i32> %423, i32 %418, i32 2
+ %425 = insertelement <8 x i32> %424, i32 %419, i32 3
+ %426 = insertelement <8 x i32> %425, i32 %420, i32 4
+ %427 = insertelement <8 x i32> %426, i32 %421, i32 5
+ %428 = insertelement <8 x i32> %427, i32 undef, i32 6
+ %429 = insertelement <8 x i32> %428, i32 undef, i32 7
+ %430 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %429, <32 x i8> %86, <16 x i8> %88, i32 2)
+ %431 = extractelement <4 x float> %430, i32 0
+ %432 = extractelement <4 x float> %430, i32 1
+ %433 = extractelement <4 x float> %430, i32 2
+ %434 = fmul float %48, %411
+ %435 = fmul float %49, %411
+ %436 = fmul float %50, %411
+ %437 = fmul float %51, %413
+ %438 = fadd float %437, %434
+ %439 = fmul float %52, %413
+ %440 = fadd float %439, %435
+ %441 = fmul float %53, %413
+ %442 = fadd float %441, %436
+ %443 = fmul float %54, %415
+ %444 = fadd float %443, %438
+ %445 = fmul float %55, %415
+ %446 = fadd float %445, %440
+ %447 = fmul float %56, %415
+ %448 = fadd float %447, %442
+ %449 = insertelement <4 x float> undef, float %444, i32 0
+ %450 = insertelement <4 x float> %449, float %446, i32 1
+ %451 = insertelement <4 x float> %450, float %448, i32 2
+ %452 = insertelement <4 x float> %451, float %195, i32 3
+ %453 = call <4 x float> @llvm.AMDGPU.cube(<4 x float> %452)
+ %454 = extractelement <4 x float> %453, i32 0
+ %455 = extractelement <4 x float> %453, i32 1
+ %456 = extractelement <4 x float> %453, i32 2
+ %457 = extractelement <4 x float> %453, i32 3
+ %458 = call float @fabs(float %456)
+ %459 = fdiv float 1.000000e+00, %458
+ %460 = fmul float %454, %459
+ %461 = fadd float %460, 1.500000e+00
+ %462 = fmul float %455, %459
+ %463 = fadd float %462, 1.500000e+00
+ %464 = bitcast float %463 to i32
+ %465 = bitcast float %461 to i32
+ %466 = bitcast float %457 to i32
+ %467 = insertelement <4 x i32> undef, i32 %464, i32 0
+ %468 = insertelement <4 x i32> %467, i32 %465, i32 1
+ %469 = insertelement <4 x i32> %468, i32 %466, i32 2
+ %470 = insertelement <4 x i32> %469, i32 undef, i32 3
+ %471 = call <4 x float> @llvm.SI.sample.v4i32(<4 x i32> %470, <32 x i8> %90, <16 x i8> %92, i32 4)
+ %472 = extractelement <4 x float> %471, i32 0
+ %473 = extractelement <4 x float> %471, i32 1
+ %474 = extractelement <4 x float> %471, i32 2
+ %475 = fmul float %431, %472
+ %476 = fadd float %475, %329
+ %477 = fmul float %432, %473
+ %478 = fadd float %477, %330
+ %479 = fmul float %433, %474
+ %480 = fadd float %479, %331
+ %481 = fmul float %107, %107
+ %482 = fmul float %108, %108
+ %483 = fadd float %482, %481
+ %484 = fmul float %109, %109
+ %485 = fadd float %483, %484
+ %486 = call float @llvm.AMDGPU.rsq.f32(float %485)
+ %487 = fmul float %107, %486
+ %488 = fmul float %108, %486
+ %489 = fmul float %109, %486
+ %490 = fmul float %377, %40
+ %491 = fmul float %378, %41
+ %492 = fmul float %379, %42
+ %493 = fmul float %359, %487
+ %494 = fmul float %360, %488
+ %495 = fadd float %494, %493
+ %496 = fmul float %361, %489
+ %497 = fadd float %495, %496
+ %498 = fmul float %497, %359
+ %499 = fmul float %497, %360
+ %500 = fmul float %497, %361
+ %501 = fmul float 2.000000e+00, %498
+ %502 = fmul float 2.000000e+00, %499
+ %503 = fmul float 2.000000e+00, %500
+ %504 = fsub float -0.000000e+00, %501
+ %505 = fadd float %487, %504
+ %506 = fsub float -0.000000e+00, %502
+ %507 = fadd float %488, %506
+ %508 = fsub float -0.000000e+00, %503
+ %509 = fadd float %489, %508
+ %510 = fmul float %95, %95
+ %511 = fmul float %96, %96
+ %512 = fadd float %511, %510
+ %513 = fmul float %97, %97
+ %514 = fadd float %512, %513
+ %515 = call float @llvm.AMDGPU.rsq.f32(float %514)
+ %516 = fmul float %95, %515
+ %517 = fmul float %96, %515
+ %518 = fmul float %97, %515
+ %519 = fmul float %505, %516
+ %520 = fmul float %507, %517
+ %521 = fadd float %520, %519
+ %522 = fmul float %509, %518
+ %523 = fadd float %521, %522
+ %524 = fsub float -0.000000e+00, %523
+ %525 = fcmp uge float %524, 0.000000e+00
+ %526 = select i1 %525, float %524, float 0.000000e+00
+ %527 = fmul float %43, %380
+ %528 = fadd float %527, 1.000000e+00
+ %529 = call float @llvm.pow.f32(float %526, float %528)
+ %530 = fmul float %476, %37
+ %531 = fmul float %478, %38
+ %532 = fmul float %480, %39
+ %533 = fmul float %359, %487
+ %534 = fmul float %360, %488
+ %535 = fadd float %534, %533
+ %536 = fmul float %361, %489
+ %537 = fadd float %535, %536
+ %538 = fcmp uge float %537, 0.000000e+00
+ %539 = select i1 %538, float %537, float 0.000000e+00
+ %540 = fmul float %530, %539
+ %541 = fmul float %531, %539
+ %542 = fmul float %532, %539
+ %543 = fmul float %490, %529
+ %544 = fadd float %543, %540
+ %545 = fmul float %491, %529
+ %546 = fadd float %545, %541
+ %547 = fmul float %492, %529
+ %548 = fadd float %547, %542
+ %549 = fmul float %476, %34
+ %550 = fmul float %478, %35
+ %551 = fmul float %480, %36
+ %552 = fmul float %544, %57
+ %553 = fadd float %552, %549
+ %554 = fmul float %546, %58
+ %555 = fadd float %554, %550
+ %556 = fmul float %548, %59
+ %557 = fadd float %556, %551
+ %558 = bitcast float %136 to i32
+ %559 = bitcast float %182 to i32
+ %560 = bitcast float %137 to i32
+ %561 = bitcast float %183 to i32
+ %562 = bitcast float %temp28.1 to i32
+ %563 = bitcast float %temp29.1 to i32
+ %564 = insertelement <8 x i32> undef, i32 %558, i32 0
+ %565 = insertelement <8 x i32> %564, i32 %559, i32 1
+ %566 = insertelement <8 x i32> %565, i32 %560, i32 2
+ %567 = insertelement <8 x i32> %566, i32 %561, i32 3
+ %568 = insertelement <8 x i32> %567, i32 %562, i32 4
+ %569 = insertelement <8 x i32> %568, i32 %563, i32 5
+ %570 = insertelement <8 x i32> %569, i32 undef, i32 6
+ %571 = insertelement <8 x i32> %570, i32 undef, i32 7
+ %572 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %571, <32 x i8> %74, <16 x i8> %76, i32 2)
+ %573 = extractelement <4 x float> %572, i32 0
+ %574 = extractelement <4 x float> %572, i32 1
+ %575 = extractelement <4 x float> %572, i32 2
+ %576 = fmul float %573, %44
+ %577 = fadd float %576, %553
+ %578 = fmul float %574, %45
+ %579 = fadd float %578, %555
+ %580 = fmul float %575, %46
+ %581 = fadd float %580, %557
+ %582 = call i32 @llvm.SI.packf16(float %577, float %579)
+ %583 = bitcast i32 %582 to float
+ %584 = call i32 @llvm.SI.packf16(float %581, float %283)
+ %585 = bitcast i32 %584 to float
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %583, float %585, float %583, float %585)
+ ret void
+
+ENDIF66: ; preds = %LOOP65
+ %586 = bitcast float %temp28.1 to i32
+ %587 = bitcast float %temp29.1 to i32
+ %588 = insertelement <8 x i32> %237, i32 %586, i32 4
+ %589 = insertelement <8 x i32> %588, i32 %587, i32 5
+ %590 = insertelement <8 x i32> %589, i32 undef, i32 6
+ %591 = insertelement <8 x i32> %590, i32 undef, i32 7
+ %592 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %591, <32 x i8> %62, <16 x i8> %64, i32 2)
+ %593 = extractelement <4 x float> %592, i32 3
+ %594 = fcmp oge float %temp30.1, %593
+ %595 = sext i1 %594 to i32
+ %596 = bitcast i32 %595 to float
+ %597 = bitcast float %596 to i32
+ %598 = and i32 %597, 1065353216
+ %599 = bitcast i32 %598 to float
+ %600 = fmul float 5.000000e-01, %temp32.0
+ %601 = fsub float -0.000000e+00, %600
+ %602 = fmul float %599, %temp32.0
+ %603 = fadd float %602, %601
+ %604 = fmul float %214, %603
+ %605 = fadd float %604, %temp28.1
+ %606 = fmul float %215, %603
+ %607 = fadd float %606, %temp29.1
+ %608 = fmul float %216, %603
+ %609 = fadd float %608, %temp30.1
+ %610 = fadd float %temp24.1, 1.000000e+00
+ %611 = fmul float %temp32.0, 5.000000e-01
+ br label %LOOP65
+}
+
+; Function Attrs: nounwind readnone
+declare float @llvm.SI.load.const(<16 x i8>, i32) #1
+
+; Function Attrs: nounwind readnone
+declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1
+
+; Function Attrs: readnone
+declare i32 @llvm.SI.tid() #2
+
+; Function Attrs: readonly
+declare float @ceil(float) #3
+
+; Function Attrs: readnone
+declare float @llvm.AMDGPU.rsq.f32(float) #2
+
+; Function Attrs: nounwind readnone
+declare <4 x float> @llvm.SI.sampled.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32) #1
+
+; Function Attrs: readnone
+declare <4 x float> @llvm.AMDGPU.cube(<4 x float>) #2
+
+; Function Attrs: readnone
+declare float @fabs(float) #2
+
+; Function Attrs: nounwind readnone
+declare <4 x float> @llvm.SI.sample.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32) #1
+
+; Function Attrs: nounwind readonly
+declare float @llvm.pow.f32(float, float) #4
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.SI.packf16(float, float) #1
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+
+attributes #0 = { "ShaderType"="0" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { readnone }
+attributes #3 = { readonly }
+attributes #4 = { nounwind readonly }
+
+!0 = !{!"const", null, i32 1}
+
+; CHECK-LABEL: {{^}}main1:
+; CHECK: s_endpgm
+define void @main1([17 x <16 x i8>] addrspace(2)* byval, [32 x <16 x i8>] addrspace(2)* byval, [16 x <32 x i8>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
+main_body:
+ %21 = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %0, i64 0, i32 0
+ %22 = load <16 x i8>, <16 x i8> addrspace(2)* %21, !tbaa !0
+ %23 = call float @llvm.SI.load.const(<16 x i8> %22, i32 0)
+ %24 = call float @llvm.SI.load.const(<16 x i8> %22, i32 4)
+ %25 = call float @llvm.SI.load.const(<16 x i8> %22, i32 8)
+ %26 = call float @llvm.SI.load.const(<16 x i8> %22, i32 12)
+ %27 = call float @llvm.SI.load.const(<16 x i8> %22, i32 28)
+ %28 = call float @llvm.SI.load.const(<16 x i8> %22, i32 48)
+ %29 = call float @llvm.SI.load.const(<16 x i8> %22, i32 52)
+ %30 = call float @llvm.SI.load.const(<16 x i8> %22, i32 56)
+ %31 = call float @llvm.SI.load.const(<16 x i8> %22, i32 64)
+ %32 = call float @llvm.SI.load.const(<16 x i8> %22, i32 68)
+ %33 = call float @llvm.SI.load.const(<16 x i8> %22, i32 72)
+ %34 = call float @llvm.SI.load.const(<16 x i8> %22, i32 76)
+ %35 = call float @llvm.SI.load.const(<16 x i8> %22, i32 128)
+ %36 = call float @llvm.SI.load.const(<16 x i8> %22, i32 132)
+ %37 = call float @llvm.SI.load.const(<16 x i8> %22, i32 144)
+ %38 = call float @llvm.SI.load.const(<16 x i8> %22, i32 148)
+ %39 = call float @llvm.SI.load.const(<16 x i8> %22, i32 152)
+ %40 = call float @llvm.SI.load.const(<16 x i8> %22, i32 160)
+ %41 = call float @llvm.SI.load.const(<16 x i8> %22, i32 164)
+ %42 = call float @llvm.SI.load.const(<16 x i8> %22, i32 168)
+ %43 = call float @llvm.SI.load.const(<16 x i8> %22, i32 172)
+ %44 = call float @llvm.SI.load.const(<16 x i8> %22, i32 176)
+ %45 = call float @llvm.SI.load.const(<16 x i8> %22, i32 180)
+ %46 = call float @llvm.SI.load.const(<16 x i8> %22, i32 184)
+ %47 = call float @llvm.SI.load.const(<16 x i8> %22, i32 192)
+ %48 = call float @llvm.SI.load.const(<16 x i8> %22, i32 196)
+ %49 = call float @llvm.SI.load.const(<16 x i8> %22, i32 200)
+ %50 = call float @llvm.SI.load.const(<16 x i8> %22, i32 208)
+ %51 = call float @llvm.SI.load.const(<16 x i8> %22, i32 212)
+ %52 = call float @llvm.SI.load.const(<16 x i8> %22, i32 216)
+ %53 = call float @llvm.SI.load.const(<16 x i8> %22, i32 220)
+ %54 = call float @llvm.SI.load.const(<16 x i8> %22, i32 236)
+ %55 = call float @llvm.SI.load.const(<16 x i8> %22, i32 240)
+ %56 = call float @llvm.SI.load.const(<16 x i8> %22, i32 244)
+ %57 = call float @llvm.SI.load.const(<16 x i8> %22, i32 248)
+ %58 = call float @llvm.SI.load.const(<16 x i8> %22, i32 252)
+ %59 = call float @llvm.SI.load.const(<16 x i8> %22, i32 256)
+ %60 = call float @llvm.SI.load.const(<16 x i8> %22, i32 260)
+ %61 = call float @llvm.SI.load.const(<16 x i8> %22, i32 264)
+ %62 = call float @llvm.SI.load.const(<16 x i8> %22, i32 268)
+ %63 = call float @llvm.SI.load.const(<16 x i8> %22, i32 272)
+ %64 = call float @llvm.SI.load.const(<16 x i8> %22, i32 276)
+ %65 = call float @llvm.SI.load.const(<16 x i8> %22, i32 280)
+ %66 = call float @llvm.SI.load.const(<16 x i8> %22, i32 284)
+ %67 = call float @llvm.SI.load.const(<16 x i8> %22, i32 288)
+ %68 = call float @llvm.SI.load.const(<16 x i8> %22, i32 292)
+ %69 = call float @llvm.SI.load.const(<16 x i8> %22, i32 464)
+ %70 = call float @llvm.SI.load.const(<16 x i8> %22, i32 468)
+ %71 = call float @llvm.SI.load.const(<16 x i8> %22, i32 472)
+ %72 = call float @llvm.SI.load.const(<16 x i8> %22, i32 496)
+ %73 = call float @llvm.SI.load.const(<16 x i8> %22, i32 500)
+ %74 = call float @llvm.SI.load.const(<16 x i8> %22, i32 504)
+ %75 = call float @llvm.SI.load.const(<16 x i8> %22, i32 512)
+ %76 = call float @llvm.SI.load.const(<16 x i8> %22, i32 516)
+ %77 = call float @llvm.SI.load.const(<16 x i8> %22, i32 524)
+ %78 = call float @llvm.SI.load.const(<16 x i8> %22, i32 532)
+ %79 = call float @llvm.SI.load.const(<16 x i8> %22, i32 536)
+ %80 = call float @llvm.SI.load.const(<16 x i8> %22, i32 540)
+ %81 = call float @llvm.SI.load.const(<16 x i8> %22, i32 544)
+ %82 = call float @llvm.SI.load.const(<16 x i8> %22, i32 548)
+ %83 = call float @llvm.SI.load.const(<16 x i8> %22, i32 552)
+ %84 = call float @llvm.SI.load.const(<16 x i8> %22, i32 556)
+ %85 = call float @llvm.SI.load.const(<16 x i8> %22, i32 560)
+ %86 = call float @llvm.SI.load.const(<16 x i8> %22, i32 564)
+ %87 = call float @llvm.SI.load.const(<16 x i8> %22, i32 568)
+ %88 = call float @llvm.SI.load.const(<16 x i8> %22, i32 572)
+ %89 = call float @llvm.SI.load.const(<16 x i8> %22, i32 576)
+ %90 = call float @llvm.SI.load.const(<16 x i8> %22, i32 580)
+ %91 = call float @llvm.SI.load.const(<16 x i8> %22, i32 584)
+ %92 = call float @llvm.SI.load.const(<16 x i8> %22, i32 588)
+ %93 = call float @llvm.SI.load.const(<16 x i8> %22, i32 592)
+ %94 = call float @llvm.SI.load.const(<16 x i8> %22, i32 596)
+ %95 = call float @llvm.SI.load.const(<16 x i8> %22, i32 600)
+ %96 = call float @llvm.SI.load.const(<16 x i8> %22, i32 604)
+ %97 = call float @llvm.SI.load.const(<16 x i8> %22, i32 608)
+ %98 = call float @llvm.SI.load.const(<16 x i8> %22, i32 612)
+ %99 = call float @llvm.SI.load.const(<16 x i8> %22, i32 616)
+ %100 = call float @llvm.SI.load.const(<16 x i8> %22, i32 624)
+ %101 = call float @llvm.SI.load.const(<16 x i8> %22, i32 628)
+ %102 = call float @llvm.SI.load.const(<16 x i8> %22, i32 632)
+ %103 = call float @llvm.SI.load.const(<16 x i8> %22, i32 636)
+ %104 = call float @llvm.SI.load.const(<16 x i8> %22, i32 640)
+ %105 = call float @llvm.SI.load.const(<16 x i8> %22, i32 644)
+ %106 = call float @llvm.SI.load.const(<16 x i8> %22, i32 648)
+ %107 = call float @llvm.SI.load.const(<16 x i8> %22, i32 652)
+ %108 = call float @llvm.SI.load.const(<16 x i8> %22, i32 656)
+ %109 = call float @llvm.SI.load.const(<16 x i8> %22, i32 660)
+ %110 = call float @llvm.SI.load.const(<16 x i8> %22, i32 664)
+ %111 = call float @llvm.SI.load.const(<16 x i8> %22, i32 668)
+ %112 = call float @llvm.SI.load.const(<16 x i8> %22, i32 672)
+ %113 = call float @llvm.SI.load.const(<16 x i8> %22, i32 676)
+ %114 = call float @llvm.SI.load.const(<16 x i8> %22, i32 680)
+ %115 = call float @llvm.SI.load.const(<16 x i8> %22, i32 684)
+ %116 = call float @llvm.SI.load.const(<16 x i8> %22, i32 688)
+ %117 = call float @llvm.SI.load.const(<16 x i8> %22, i32 692)
+ %118 = call float @llvm.SI.load.const(<16 x i8> %22, i32 696)
+ %119 = call float @llvm.SI.load.const(<16 x i8> %22, i32 700)
+ %120 = call float @llvm.SI.load.const(<16 x i8> %22, i32 704)
+ %121 = call float @llvm.SI.load.const(<16 x i8> %22, i32 708)
+ %122 = call float @llvm.SI.load.const(<16 x i8> %22, i32 712)
+ %123 = call float @llvm.SI.load.const(<16 x i8> %22, i32 716)
+ %124 = call float @llvm.SI.load.const(<16 x i8> %22, i32 864)
+ %125 = call float @llvm.SI.load.const(<16 x i8> %22, i32 868)
+ %126 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 0
+ %127 = load <32 x i8>, <32 x i8> addrspace(2)* %126, !tbaa !0
+ %128 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 0
+ %129 = load <16 x i8>, <16 x i8> addrspace(2)* %128, !tbaa !0
+ %130 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 1
+ %131 = load <32 x i8>, <32 x i8> addrspace(2)* %130, !tbaa !0
+ %132 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 1
+ %133 = load <16 x i8>, <16 x i8> addrspace(2)* %132, !tbaa !0
+ %134 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 2
+ %135 = load <32 x i8>, <32 x i8> addrspace(2)* %134, !tbaa !0
+ %136 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 2
+ %137 = load <16 x i8>, <16 x i8> addrspace(2)* %136, !tbaa !0
+ %138 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 3
+ %139 = load <32 x i8>, <32 x i8> addrspace(2)* %138, !tbaa !0
+ %140 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 3
+ %141 = load <16 x i8>, <16 x i8> addrspace(2)* %140, !tbaa !0
+ %142 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 4
+ %143 = load <32 x i8>, <32 x i8> addrspace(2)* %142, !tbaa !0
+ %144 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 4
+ %145 = load <16 x i8>, <16 x i8> addrspace(2)* %144, !tbaa !0
+ %146 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 5
+ %147 = load <32 x i8>, <32 x i8> addrspace(2)* %146, !tbaa !0
+ %148 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 5
+ %149 = load <16 x i8>, <16 x i8> addrspace(2)* %148, !tbaa !0
+ %150 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 6
+ %151 = load <32 x i8>, <32 x i8> addrspace(2)* %150, !tbaa !0
+ %152 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 6
+ %153 = load <16 x i8>, <16 x i8> addrspace(2)* %152, !tbaa !0
+ %154 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 7
+ %155 = load <32 x i8>, <32 x i8> addrspace(2)* %154, !tbaa !0
+ %156 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 7
+ %157 = load <16 x i8>, <16 x i8> addrspace(2)* %156, !tbaa !0
+ %158 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 8
+ %159 = load <32 x i8>, <32 x i8> addrspace(2)* %158, !tbaa !0
+ %160 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 8
+ %161 = load <16 x i8>, <16 x i8> addrspace(2)* %160, !tbaa !0
+ %162 = fcmp ugt float %17, 0.000000e+00
+ %163 = select i1 %162, float 1.000000e+00, float 0.000000e+00
+ %164 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %4, <2 x i32> %6)
+ %165 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %4, <2 x i32> %6)
+ %166 = call float @llvm.SI.fs.interp(i32 2, i32 0, i32 %4, <2 x i32> %6)
+ %167 = call float @llvm.SI.fs.interp(i32 3, i32 0, i32 %4, <2 x i32> %6)
+ %168 = call float @llvm.SI.fs.interp(i32 0, i32 1, i32 %4, <2 x i32> %6)
+ %169 = call float @llvm.SI.fs.interp(i32 1, i32 1, i32 %4, <2 x i32> %6)
+ %170 = call float @llvm.SI.fs.interp(i32 2, i32 1, i32 %4, <2 x i32> %6)
+ %171 = call float @llvm.SI.fs.interp(i32 3, i32 1, i32 %4, <2 x i32> %6)
+ %172 = call float @llvm.SI.fs.interp(i32 0, i32 2, i32 %4, <2 x i32> %6)
+ %173 = call float @llvm.SI.fs.interp(i32 1, i32 2, i32 %4, <2 x i32> %6)
+ %174 = call float @llvm.SI.fs.interp(i32 2, i32 2, i32 %4, <2 x i32> %6)
+ %175 = call float @llvm.SI.fs.interp(i32 3, i32 2, i32 %4, <2 x i32> %6)
+ %176 = call float @llvm.SI.fs.interp(i32 0, i32 3, i32 %4, <2 x i32> %6)
+ %177 = call float @llvm.SI.fs.interp(i32 1, i32 3, i32 %4, <2 x i32> %6)
+ %178 = call float @llvm.SI.fs.interp(i32 2, i32 3, i32 %4, <2 x i32> %6)
+ %179 = call float @llvm.SI.fs.interp(i32 3, i32 3, i32 %4, <2 x i32> %6)
+ %180 = call float @llvm.SI.fs.interp(i32 0, i32 4, i32 %4, <2 x i32> %6)
+ %181 = call float @llvm.SI.fs.interp(i32 1, i32 4, i32 %4, <2 x i32> %6)
+ %182 = call float @llvm.SI.fs.interp(i32 2, i32 4, i32 %4, <2 x i32> %6)
+ %183 = call float @llvm.SI.fs.interp(i32 3, i32 4, i32 %4, <2 x i32> %6)
+ %184 = call float @llvm.SI.fs.interp(i32 0, i32 5, i32 %4, <2 x i32> %6)
+ %185 = call float @llvm.SI.fs.interp(i32 1, i32 5, i32 %4, <2 x i32> %6)
+ %186 = call float @llvm.SI.fs.interp(i32 2, i32 5, i32 %4, <2 x i32> %6)
+ %187 = call float @llvm.SI.fs.interp(i32 3, i32 5, i32 %4, <2 x i32> %6)
+ %188 = call float @llvm.SI.fs.interp(i32 0, i32 6, i32 %4, <2 x i32> %6)
+ %189 = call float @llvm.SI.fs.interp(i32 1, i32 6, i32 %4, <2 x i32> %6)
+ %190 = call float @llvm.SI.fs.interp(i32 2, i32 6, i32 %4, <2 x i32> %6)
+ %191 = call float @llvm.SI.fs.interp(i32 3, i32 6, i32 %4, <2 x i32> %6)
+ %192 = call float @llvm.SI.fs.interp(i32 0, i32 7, i32 %4, <2 x i32> %6)
+ %193 = call float @llvm.SI.fs.interp(i32 1, i32 7, i32 %4, <2 x i32> %6)
+ %194 = call float @llvm.SI.fs.interp(i32 2, i32 7, i32 %4, <2 x i32> %6)
+ %195 = call float @llvm.SI.fs.interp(i32 3, i32 7, i32 %4, <2 x i32> %6)
+ %196 = fmul float %14, %124
+ %197 = fadd float %196, %125
+ %198 = call float @llvm.AMDIL.clamp.(float %163, float 0.000000e+00, float 1.000000e+00)
+ %199 = call float @llvm.AMDIL.clamp.(float 0.000000e+00, float 0.000000e+00, float 1.000000e+00)
+ %200 = call float @llvm.AMDIL.clamp.(float 0.000000e+00, float 0.000000e+00, float 1.000000e+00)
+ %201 = call float @llvm.AMDIL.clamp.(float 1.000000e+00, float 0.000000e+00, float 1.000000e+00)
+ %202 = bitcast float %198 to i32
+ %203 = icmp ne i32 %202, 0
+ %. = select i1 %203, float -1.000000e+00, float 1.000000e+00
+ %204 = fsub float -0.000000e+00, %164
+ %205 = fadd float %44, %204
+ %206 = fsub float -0.000000e+00, %165
+ %207 = fadd float %45, %206
+ %208 = fsub float -0.000000e+00, %166
+ %209 = fadd float %46, %208
+ %210 = fmul float %205, %205
+ %211 = fmul float %207, %207
+ %212 = fadd float %211, %210
+ %213 = fmul float %209, %209
+ %214 = fadd float %212, %213
+ %215 = call float @llvm.AMDGPU.rsq.f32(float %214)
+ %216 = fmul float %205, %215
+ %217 = fmul float %207, %215
+ %218 = fmul float %209, %215
+ %219 = fmul float %., %54
+ %220 = fmul float %13, %47
+ %221 = fmul float %197, %48
+ %222 = bitcast float %174 to i32
+ %223 = bitcast float %175 to i32
+ %224 = insertelement <2 x i32> undef, i32 %222, i32 0
+ %225 = insertelement <2 x i32> %224, i32 %223, i32 1
+ %226 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %225, <32 x i8> %131, <16 x i8> %133, i32 2)
+ %227 = extractelement <4 x float> %226, i32 0
+ %228 = extractelement <4 x float> %226, i32 1
+ %229 = extractelement <4 x float> %226, i32 2
+ %230 = extractelement <4 x float> %226, i32 3
+ %231 = fmul float %227, 0x4012611180000000
+ %232 = fmul float %228, 0x4012611180000000
+ %233 = fmul float %229, 0x4012611180000000
+ %234 = call float @llvm.AMDGPU.lrp(float %27, float %231, float 1.000000e+00)
+ %235 = call float @llvm.AMDGPU.lrp(float %27, float %232, float 1.000000e+00)
+ %236 = call float @llvm.AMDGPU.lrp(float %27, float %233, float 1.000000e+00)
+ %237 = fmul float %216, %184
+ %238 = fmul float %217, %185
+ %239 = fadd float %238, %237
+ %240 = fmul float %218, %186
+ %241 = fadd float %239, %240
+ %242 = fmul float %216, %187
+ %243 = fmul float %217, %188
+ %244 = fadd float %243, %242
+ %245 = fmul float %218, %189
+ %246 = fadd float %244, %245
+ %247 = fmul float %216, %190
+ %248 = fmul float %217, %191
+ %249 = fadd float %248, %247
+ %250 = fmul float %218, %192
+ %251 = fadd float %249, %250
+ %252 = call float @llvm.AMDIL.clamp.(float %251, float 0.000000e+00, float 1.000000e+00)
+ %253 = fmul float %214, 0x3F5A36E2E0000000
+ %254 = call float @llvm.AMDIL.clamp.(float %253, float 0.000000e+00, float 1.000000e+00)
+ %255 = fsub float -0.000000e+00, %254
+ %256 = fadd float 1.000000e+00, %255
+ %257 = call float @llvm.pow.f32(float %252, float 2.500000e-01)
+ %258 = fmul float %39, %257
+ %259 = fmul float %241, %258
+ %260 = fmul float %246, %258
+ %261 = fmul float %259, %230
+ %262 = fmul float %260, %230
+ %263 = fadd float %252, 0x3EE4F8B580000000
+ %264 = fsub float -0.000000e+00, %252
+ %265 = fadd float 1.000000e+00, %264
+ %266 = fmul float 1.200000e+01, %265
+ %267 = fadd float %266, 4.000000e+00
+ %268 = fsub float -0.000000e+00, %267
+ %269 = fmul float %268, %263
+ %270 = fsub float -0.000000e+00, %267
+ %271 = fmul float %270, %263
+ %272 = fsub float -0.000000e+00, %267
+ %273 = fmul float %272, %263
+ %274 = fdiv float 1.000000e+00, %269
+ %275 = fdiv float 1.000000e+00, %271
+ %276 = fdiv float 1.000000e+00, %273
+ %277 = fmul float %261, %274
+ %278 = fmul float %262, %275
+ %279 = fmul float %263, %276
+ br label %LOOP
+
+LOOP: ; preds = %LOOP, %main_body
+ %temp144.0 = phi float [ 1.000000e+00, %main_body ], [ %292, %LOOP ]
+ %temp168.0 = phi float [ %176, %main_body ], [ %288, %LOOP ]
+ %temp169.0 = phi float [ %177, %main_body ], [ %289, %LOOP ]
+ %temp170.0 = phi float [ %256, %main_body ], [ %290, %LOOP ]
+ %280 = bitcast float %temp168.0 to i32
+ %281 = bitcast float %temp169.0 to i32
+ %282 = insertelement <4 x i32> undef, i32 %280, i32 0
+ %283 = insertelement <4 x i32> %282, i32 %281, i32 1
+ %284 = insertelement <4 x i32> %283, i32 0, i32 2
+ %285 = insertelement <4 x i32> %284, i32 undef, i32 3
+ %286 = call <4 x float> @llvm.SI.samplel.v4i32(<4 x i32> %285, <32 x i8> %147, <16 x i8> %149, i32 2)
+ %287 = extractelement <4 x float> %286, i32 3
+ %288 = fadd float %temp168.0, %277
+ %289 = fadd float %temp169.0, %278
+ %290 = fadd float %temp170.0, %279
+ %291 = fsub float -0.000000e+00, %287
+ %292 = fadd float %290, %291
+ %293 = fcmp oge float 0.000000e+00, %292
+ %294 = sext i1 %293 to i32
+ %295 = bitcast i32 %294 to float
+ %296 = bitcast float %295 to i32
+ %297 = icmp ne i32 %296, 0
+ br i1 %297, label %IF189, label %LOOP
+
+IF189: ; preds = %LOOP
+ %298 = extractelement <4 x float> %286, i32 0
+ %299 = extractelement <4 x float> %286, i32 1
+ %300 = extractelement <4 x float> %286, i32 2
+ %301 = fsub float -0.000000e+00, %292
+ %302 = fadd float %temp144.0, %301
+ %303 = fdiv float 1.000000e+00, %302
+ %304 = fmul float %292, %303
+ %305 = fadd float %304, -1.000000e+00
+ %306 = fmul float %305, %277
+ %307 = fadd float %306, %288
+ %308 = fmul float %305, %278
+ %309 = fadd float %308, %289
+ %310 = fsub float -0.000000e+00, %176
+ %311 = fadd float %307, %310
+ %312 = fsub float -0.000000e+00, %177
+ %313 = fadd float %309, %312
+ %314 = fadd float %176, %311
+ %315 = fadd float %177, %313
+ %316 = fmul float %311, %67
+ %317 = fmul float %313, %68
+ %318 = fmul float %316, %55
+ %319 = fmul float %316, %56
+ %320 = fmul float %317, %57
+ %321 = fadd float %320, %318
+ %322 = fmul float %317, %58
+ %323 = fadd float %322, %319
+ %324 = fadd float %178, %321
+ %325 = fadd float %179, %323
+ %326 = fmul float %316, %59
+ %327 = fmul float %316, %60
+ %328 = fmul float %316, %61
+ %329 = fmul float %316, %62
+ %330 = fmul float %317, %63
+ %331 = fadd float %330, %326
+ %332 = fmul float %317, %64
+ %333 = fadd float %332, %327
+ %334 = fmul float %317, %65
+ %335 = fadd float %334, %328
+ %336 = fmul float %317, %66
+ %337 = fadd float %336, %329
+ %338 = fadd float %168, %331
+ %339 = fadd float %169, %333
+ %340 = fadd float %170, %335
+ %341 = fadd float %171, %337
+ %342 = bitcast float %338 to i32
+ %343 = bitcast float %339 to i32
+ %344 = insertelement <2 x i32> undef, i32 %342, i32 0
+ %345 = insertelement <2 x i32> %344, i32 %343, i32 1
+ %346 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %345, <32 x i8> %135, <16 x i8> %137, i32 2)
+ %347 = extractelement <4 x float> %346, i32 0
+ %348 = extractelement <4 x float> %346, i32 1
+ %349 = extractelement <4 x float> %346, i32 2
+ %350 = extractelement <4 x float> %346, i32 3
+ %351 = fmul float %347, %23
+ %352 = fmul float %348, %24
+ %353 = fmul float %349, %25
+ %354 = fmul float %350, %26
+ %355 = fmul float %351, %180
+ %356 = fmul float %352, %181
+ %357 = fmul float %353, %182
+ %358 = fmul float %354, %183
+ %359 = fsub float -0.000000e+00, %350
+ %360 = fadd float 1.000000e+00, %359
+ %361 = fmul float %360, %49
+ %362 = call float @llvm.AMDGPU.lrp(float %361, float %347, float %355)
+ %363 = call float @llvm.AMDGPU.lrp(float %361, float %348, float %356)
+ %364 = call float @llvm.AMDGPU.lrp(float %361, float %349, float %357)
+ %365 = bitcast float %340 to i32
+ %366 = bitcast float %341 to i32
+ %367 = insertelement <2 x i32> undef, i32 %365, i32 0
+ %368 = insertelement <2 x i32> %367, i32 %366, i32 1
+ %369 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %368, <32 x i8> %151, <16 x i8> %153, i32 2)
+ %370 = extractelement <4 x float> %369, i32 2
+ %371 = fmul float %362, %234
+ %372 = fmul float %363, %235
+ %373 = fmul float %364, %236
+ %374 = fmul float %358, %230
+ %375 = bitcast float %314 to i32
+ %376 = bitcast float %315 to i32
+ %377 = insertelement <2 x i32> undef, i32 %375, i32 0
+ %378 = insertelement <2 x i32> %377, i32 %376, i32 1
+ %379 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %378, <32 x i8> %139, <16 x i8> %141, i32 2)
+ %380 = extractelement <4 x float> %379, i32 0
+ %381 = extractelement <4 x float> %379, i32 1
+ %382 = extractelement <4 x float> %379, i32 2
+ %383 = extractelement <4 x float> %379, i32 3
+ %384 = fcmp olt float 0.000000e+00, %382
+ %385 = sext i1 %384 to i32
+ %386 = bitcast i32 %385 to float
+ %387 = bitcast float %386 to i32
+ %388 = icmp ne i32 %387, 0
+ %.224 = select i1 %388, float %381, float %380
+ %.225 = select i1 %388, float %383, float %381
+ %389 = bitcast float %324 to i32
+ %390 = bitcast float %325 to i32
+ %391 = insertelement <2 x i32> undef, i32 %389, i32 0
+ %392 = insertelement <2 x i32> %391, i32 %390, i32 1
+ %393 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %392, <32 x i8> %143, <16 x i8> %145, i32 2)
+ %394 = extractelement <4 x float> %393, i32 0
+ %395 = extractelement <4 x float> %393, i32 1
+ %396 = extractelement <4 x float> %393, i32 2
+ %397 = extractelement <4 x float> %393, i32 3
+ %398 = fcmp olt float 0.000000e+00, %396
+ %399 = sext i1 %398 to i32
+ %400 = bitcast i32 %399 to float
+ %401 = bitcast float %400 to i32
+ %402 = icmp ne i32 %401, 0
+ %temp112.1 = select i1 %402, float %395, float %394
+ %temp113.1 = select i1 %402, float %397, float %395
+ %403 = fmul float %.224, 2.000000e+00
+ %404 = fadd float %403, -1.000000e+00
+ %405 = fmul float %.225, 2.000000e+00
+ %406 = fadd float %405, -1.000000e+00
+ %407 = fmul float %temp112.1, 2.000000e+00
+ %408 = fadd float %407, -1.000000e+00
+ %409 = fmul float %temp113.1, 2.000000e+00
+ %410 = fadd float %409, -1.000000e+00
+ %411 = fsub float -0.000000e+00, %404
+ %412 = fmul float %411, %35
+ %413 = fsub float -0.000000e+00, %406
+ %414 = fmul float %413, %35
+ %415 = fsub float -0.000000e+00, %408
+ %416 = fmul float %415, %36
+ %417 = fsub float -0.000000e+00, %410
+ %418 = fmul float %417, %36
+ %419 = fmul float %416, %370
+ %420 = fmul float %418, %370
+ %421 = call float @fabs(float %412)
+ %422 = call float @fabs(float %414)
+ %423 = fsub float -0.000000e+00, %421
+ %424 = fadd float 1.000000e+00, %423
+ %425 = fsub float -0.000000e+00, %422
+ %426 = fadd float 1.000000e+00, %425
+ %427 = fmul float %424, %419
+ %428 = fadd float %427, %412
+ %429 = fmul float %426, %420
+ %430 = fadd float %429, %414
+ %431 = fmul float %428, %428
+ %432 = fmul float %430, %430
+ %433 = fadd float %431, %432
+ %434 = fsub float -0.000000e+00, %433
+ %435 = fadd float 0x3FF00068E0000000, %434
+ %436 = call float @llvm.AMDIL.clamp.(float %435, float 0.000000e+00, float 1.000000e+00)
+ %437 = call float @llvm.AMDGPU.rsq.f32(float %436)
+ %438 = fmul float %437, %436
+ %439 = fsub float -0.000000e+00, %436
+ %440 = call float @llvm.AMDGPU.cndlt(float %439, float %438, float 0.000000e+00)
+ %441 = fmul float %184, %428
+ %442 = fmul float %185, %428
+ %443 = fmul float %186, %428
+ %444 = fmul float %187, %430
+ %445 = fadd float %444, %441
+ %446 = fmul float %188, %430
+ %447 = fadd float %446, %442
+ %448 = fmul float %189, %430
+ %449 = fadd float %448, %443
+ %450 = fmul float %190, %440
+ %451 = fadd float %450, %445
+ %452 = fmul float %191, %440
+ %453 = fadd float %452, %447
+ %454 = fmul float %192, %440
+ %455 = fadd float %454, %449
+ %456 = fmul float %451, %451
+ %457 = fmul float %453, %453
+ %458 = fadd float %457, %456
+ %459 = fmul float %455, %455
+ %460 = fadd float %458, %459
+ %461 = call float @llvm.AMDGPU.rsq.f32(float %460)
+ %462 = fmul float %451, %461
+ %463 = fmul float %453, %461
+ %464 = fmul float %455, %461
+ %465 = fcmp olt float 0.000000e+00, %219
+ %466 = sext i1 %465 to i32
+ %467 = bitcast i32 %466 to float
+ %468 = bitcast float %467 to i32
+ %469 = icmp ne i32 %468, 0
+ br i1 %469, label %IF198, label %ENDIF197
+
+IF198: ; preds = %IF189
+ %470 = fsub float -0.000000e+00, %462
+ %471 = fsub float -0.000000e+00, %463
+ %472 = fsub float -0.000000e+00, %464
+ br label %ENDIF197
+
+ENDIF197: ; preds = %IF189, %IF198
+ %temp14.0 = phi float [ %472, %IF198 ], [ %464, %IF189 ]
+ %temp13.0 = phi float [ %471, %IF198 ], [ %463, %IF189 ]
+ %temp12.0 = phi float [ %470, %IF198 ], [ %462, %IF189 ]
+ %473 = bitcast float %220 to i32
+ %474 = bitcast float %221 to i32
+ %475 = insertelement <2 x i32> undef, i32 %473, i32 0
+ %476 = insertelement <2 x i32> %475, i32 %474, i32 1
+ %477 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %476, <32 x i8> %159, <16 x i8> %161, i32 2)
+ %478 = extractelement <4 x float> %477, i32 0
+ %479 = extractelement <4 x float> %477, i32 1
+ %480 = extractelement <4 x float> %477, i32 2
+ %481 = extractelement <4 x float> %477, i32 3
+ %482 = fmul float %478, %40
+ %483 = fadd float %482, %41
+ %484 = fmul float %479, %40
+ %485 = fadd float %484, %41
+ %486 = fmul float %480, %40
+ %487 = fadd float %486, %41
+ %488 = fmul float %481, %42
+ %489 = fadd float %488, %43
+ %490 = bitcast float %172 to i32
+ %491 = bitcast float %173 to i32
+ %492 = insertelement <2 x i32> undef, i32 %490, i32 0
+ %493 = insertelement <2 x i32> %492, i32 %491, i32 1
+ %494 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %493, <32 x i8> %155, <16 x i8> %157, i32 2)
+ %495 = extractelement <4 x float> %494, i32 0
+ %496 = extractelement <4 x float> %494, i32 1
+ %497 = extractelement <4 x float> %494, i32 2
+ %498 = extractelement <4 x float> %494, i32 3
+ %499 = fmul float %498, 3.200000e+01
+ %500 = fadd float %499, -1.600000e+01
+ %501 = call float @llvm.AMDIL.exp.(float %500)
+ %502 = fmul float %495, %501
+ %503 = fmul float %496, %501
+ %504 = fmul float %497, %501
+ %505 = fmul float %28, %502
+ %506 = fadd float %505, %193
+ %507 = fmul float %29, %503
+ %508 = fadd float %507, %194
+ %509 = fmul float %30, %504
+ %510 = fadd float %509, %195
+ %511 = fmul float %506, %489
+ %512 = fmul float %508, %489
+ %513 = fmul float %510, %489
+ %514 = fmul float %489, 5.000000e-01
+ %515 = fadd float %514, 5.000000e-01
+ %516 = fmul float %483, %515
+ %517 = fadd float %516, %511
+ %518 = fmul float %485, %515
+ %519 = fadd float %518, %512
+ %520 = fmul float %487, %515
+ %521 = fadd float %520, %513
+ %522 = fmul float %517, %371
+ %523 = fmul float %519, %372
+ %524 = fmul float %521, %373
+ %525 = fmul float %428, 0x3FDB272440000000
+ %526 = fmul float %430, 0xBFDB272440000000
+ %527 = fadd float %526, %525
+ %528 = fmul float %440, 0x3FE99999A0000000
+ %529 = fadd float %527, %528
+ %530 = fmul float %529, 5.000000e-01
+ %531 = fadd float %530, 0x3FE3333340000000
+ %532 = fmul float %531, %531
+ %533 = fmul float %522, %532
+ %534 = fmul float %523, %532
+ %535 = fmul float %524, %532
+ %536 = fsub float -0.000000e+00, %72
+ %537 = fsub float -0.000000e+00, %73
+ %538 = fsub float -0.000000e+00, %74
+ %539 = fmul float %temp12.0, %536
+ %540 = fmul float %temp13.0, %537
+ %541 = fadd float %540, %539
+ %542 = fmul float %temp14.0, %538
+ %543 = fadd float %541, %542
+ %544 = call float @llvm.AMDIL.clamp.(float %543, float 0.000000e+00, float 1.000000e+00)
+ %545 = fmul float %371, %544
+ %546 = fmul float %372, %544
+ %547 = fmul float %373, %544
+ %548 = fmul float %545, %69
+ %549 = fmul float %546, %70
+ %550 = fmul float %547, %71
+ %551 = fsub float -0.000000e+00, %164
+ %552 = fadd float %97, %551
+ %553 = fsub float -0.000000e+00, %165
+ %554 = fadd float %98, %553
+ %555 = fsub float -0.000000e+00, %166
+ %556 = fadd float %99, %555
+ %557 = fmul float %552, %552
+ %558 = fmul float %554, %554
+ %559 = fadd float %558, %557
+ %560 = fmul float %556, %556
+ %561 = fadd float %559, %560
+ %562 = call float @llvm.AMDGPU.rsq.f32(float %561)
+ %563 = fmul float %562, %561
+ %564 = fsub float -0.000000e+00, %561
+ %565 = call float @llvm.AMDGPU.cndlt(float %564, float %563, float 0.000000e+00)
+ %566 = fsub float -0.000000e+00, %84
+ %567 = fadd float %565, %566
+ %568 = fsub float -0.000000e+00, %83
+ %569 = fadd float %565, %568
+ %570 = fsub float -0.000000e+00, %82
+ %571 = fadd float %565, %570
+ %572 = fsub float -0.000000e+00, %84
+ %573 = fadd float %83, %572
+ %574 = fsub float -0.000000e+00, %83
+ %575 = fadd float %82, %574
+ %576 = fsub float -0.000000e+00, %82
+ %577 = fadd float %81, %576
+ %578 = fdiv float 1.000000e+00, %573
+ %579 = fdiv float 1.000000e+00, %575
+ %580 = fdiv float 1.000000e+00, %577
+ %581 = fmul float %567, %578
+ %582 = fmul float %569, %579
+ %583 = fmul float %571, %580
+ %584 = fcmp olt float %565, %83
+ %585 = sext i1 %584 to i32
+ %586 = bitcast i32 %585 to float
+ %587 = bitcast float %586 to i32
+ %588 = icmp ne i32 %587, 0
+ br i1 %588, label %ENDIF200, label %ELSE202
+
+ELSE202: ; preds = %ENDIF197
+ %589 = fcmp olt float %565, %82
+ %590 = sext i1 %589 to i32
+ %591 = bitcast i32 %590 to float
+ %592 = bitcast float %591 to i32
+ %593 = icmp ne i32 %592, 0
+ br i1 %593, label %ENDIF200, label %ELSE205
+
+ENDIF200: ; preds = %ELSE205, %ELSE202, %ENDIF197
+ %temp80.0 = phi float [ %581, %ENDIF197 ], [ %.226, %ELSE205 ], [ %582, %ELSE202 ]
+ %temp88.0 = phi float [ %122, %ENDIF197 ], [ %.227, %ELSE205 ], [ %120, %ELSE202 ]
+ %temp89.0 = phi float [ %123, %ENDIF197 ], [ %.228, %ELSE205 ], [ %121, %ELSE202 ]
+ %temp90.0 = phi float [ %120, %ENDIF197 ], [ %116, %ELSE205 ], [ %118, %ELSE202 ]
+ %temp91.0 = phi float [ %121, %ENDIF197 ], [ %117, %ELSE205 ], [ %119, %ELSE202 ]
+ %594 = fcmp olt float %565, %83
+ %595 = sext i1 %594 to i32
+ %596 = bitcast i32 %595 to float
+ %597 = bitcast float %596 to i32
+ %598 = icmp ne i32 %597, 0
+ br i1 %598, label %ENDIF209, label %ELSE211
+
+ELSE205: ; preds = %ELSE202
+ %599 = fcmp olt float %565, %81
+ %600 = sext i1 %599 to i32
+ %601 = bitcast i32 %600 to float
+ %602 = bitcast float %601 to i32
+ %603 = icmp ne i32 %602, 0
+ %.226 = select i1 %603, float %583, float 1.000000e+00
+ %.227 = select i1 %603, float %118, float %116
+ %.228 = select i1 %603, float %119, float %117
+ br label %ENDIF200
+
+ELSE211: ; preds = %ENDIF200
+ %604 = fcmp olt float %565, %82
+ %605 = sext i1 %604 to i32
+ %606 = bitcast i32 %605 to float
+ %607 = bitcast float %606 to i32
+ %608 = icmp ne i32 %607, 0
+ br i1 %608, label %ENDIF209, label %ELSE214
+
+ENDIF209: ; preds = %ELSE214, %ELSE211, %ENDIF200
+ %temp52.0 = phi float [ %108, %ENDIF200 ], [ %100, %ELSE214 ], [ %104, %ELSE211 ]
+ %temp53.0 = phi float [ %109, %ENDIF200 ], [ %101, %ELSE214 ], [ %105, %ELSE211 ]
+ %temp54.0 = phi float [ %110, %ENDIF200 ], [ %102, %ELSE214 ], [ %106, %ELSE211 ]
+ %temp55.0 = phi float [ %111, %ENDIF200 ], [ %103, %ELSE214 ], [ %107, %ELSE211 ]
+ %temp68.0 = phi float [ %112, %ENDIF200 ], [ %.230, %ELSE214 ], [ %108, %ELSE211 ]
+ %temp69.0 = phi float [ %113, %ENDIF200 ], [ %.231, %ELSE214 ], [ %109, %ELSE211 ]
+ %temp70.0 = phi float [ %114, %ENDIF200 ], [ %.232, %ELSE214 ], [ %110, %ELSE211 ]
+ %temp71.0 = phi float [ %115, %ENDIF200 ], [ %.233, %ELSE214 ], [ %111, %ELSE211 ]
+ %609 = fmul float %164, %85
+ %610 = fmul float %165, %86
+ %611 = fadd float %609, %610
+ %612 = fmul float %166, %87
+ %613 = fadd float %611, %612
+ %614 = fmul float %167, %88
+ %615 = fadd float %613, %614
+ %616 = fmul float %164, %89
+ %617 = fmul float %165, %90
+ %618 = fadd float %616, %617
+ %619 = fmul float %166, %91
+ %620 = fadd float %618, %619
+ %621 = fmul float %167, %92
+ %622 = fadd float %620, %621
+ %623 = fmul float %164, %93
+ %624 = fmul float %165, %94
+ %625 = fadd float %623, %624
+ %626 = fmul float %166, %95
+ %627 = fadd float %625, %626
+ %628 = fmul float %167, %96
+ %629 = fadd float %627, %628
+ %630 = fsub float -0.000000e+00, %78
+ %631 = fadd float 1.000000e+00, %630
+ %632 = call float @fabs(float %615)
+ %633 = call float @fabs(float %622)
+ %634 = fcmp oge float %631, %632
+ %635 = sext i1 %634 to i32
+ %636 = bitcast i32 %635 to float
+ %637 = bitcast float %636 to i32
+ %638 = and i32 %637, 1065353216
+ %639 = bitcast i32 %638 to float
+ %640 = fcmp oge float %631, %633
+ %641 = sext i1 %640 to i32
+ %642 = bitcast i32 %641 to float
+ %643 = bitcast float %642 to i32
+ %644 = and i32 %643, 1065353216
+ %645 = bitcast i32 %644 to float
+ %646 = fmul float %639, %645
+ %647 = fmul float %629, %646
+ %648 = fmul float %615, %temp68.0
+ %649 = fadd float %648, %temp70.0
+ %650 = fmul float %622, %temp69.0
+ %651 = fadd float %650, %temp71.0
+ %652 = fmul float %615, %temp52.0
+ %653 = fadd float %652, %temp54.0
+ %654 = fmul float %622, %temp53.0
+ %655 = fadd float %654, %temp55.0
+ %656 = fadd float %temp80.0, -1.000000e+00
+ %657 = fmul float %656, %77
+ %658 = fadd float %657, 1.000000e+00
+ %659 = call float @llvm.AMDIL.clamp.(float %658, float 0.000000e+00, float 1.000000e+00)
+ %660 = bitcast float %649 to i32
+ %661 = bitcast float %651 to i32
+ %662 = bitcast float 0.000000e+00 to i32
+ %663 = insertelement <4 x i32> undef, i32 %660, i32 0
+ %664 = insertelement <4 x i32> %663, i32 %661, i32 1
+ %665 = insertelement <4 x i32> %664, i32 %662, i32 2
+ %666 = insertelement <4 x i32> %665, i32 undef, i32 3
+ %667 = call <4 x float> @llvm.SI.samplel.v4i32(<4 x i32> %666, <32 x i8> %127, <16 x i8> %129, i32 2)
+ %668 = extractelement <4 x float> %667, i32 0
+ %669 = extractelement <4 x float> %667, i32 1
+ %670 = bitcast float %653 to i32
+ %671 = bitcast float %655 to i32
+ %672 = bitcast float 0.000000e+00 to i32
+ %673 = insertelement <4 x i32> undef, i32 %670, i32 0
+ %674 = insertelement <4 x i32> %673, i32 %671, i32 1
+ %675 = insertelement <4 x i32> %674, i32 %672, i32 2
+ %676 = insertelement <4 x i32> %675, i32 undef, i32 3
+ %677 = call <4 x float> @llvm.SI.samplel.v4i32(<4 x i32> %676, <32 x i8> %127, <16 x i8> %129, i32 2)
+ %678 = extractelement <4 x float> %677, i32 0
+ %679 = extractelement <4 x float> %677, i32 1
+ %680 = fsub float -0.000000e+00, %669
+ %681 = fadd float 1.000000e+00, %680
+ %682 = fsub float -0.000000e+00, %679
+ %683 = fadd float 1.000000e+00, %682
+ %684 = fmul float %681, 2.500000e-01
+ %685 = fmul float %683, 2.500000e-01
+ %686 = fsub float -0.000000e+00, %684
+ %687 = fadd float %668, %686
+ %688 = fsub float -0.000000e+00, %685
+ %689 = fadd float %678, %688
+ %690 = fmul float %647, %temp88.0
+ %691 = fadd float %690, %temp89.0
+ %692 = fmul float %647, %temp90.0
+ %693 = fadd float %692, %temp91.0
+ %694 = call float @llvm.AMDIL.clamp.(float %691, float 0.000000e+00, float 1.000000e+00)
+ %695 = call float @llvm.AMDIL.clamp.(float %693, float 0.000000e+00, float 1.000000e+00)
+ %696 = fsub float -0.000000e+00, %694
+ %697 = fadd float %668, %696
+ %698 = fsub float -0.000000e+00, %695
+ %699 = fadd float %678, %698
+ %700 = fmul float %668, %668
+ %701 = fmul float %678, %678
+ %702 = fsub float -0.000000e+00, %700
+ %703 = fadd float %687, %702
+ %704 = fsub float -0.000000e+00, %701
+ %705 = fadd float %689, %704
+ %706 = fcmp uge float %703, %75
+ %707 = select i1 %706, float %703, float %75
+ %708 = fcmp uge float %705, %75
+ %709 = select i1 %708, float %705, float %75
+ %710 = fmul float %697, %697
+ %711 = fadd float %710, %707
+ %712 = fmul float %699, %699
+ %713 = fadd float %712, %709
+ %714 = fdiv float 1.000000e+00, %711
+ %715 = fdiv float 1.000000e+00, %713
+ %716 = fmul float %707, %714
+ %717 = fmul float %709, %715
+ %718 = fcmp oge float %697, 0.000000e+00
+ %719 = sext i1 %718 to i32
+ %720 = bitcast i32 %719 to float
+ %721 = bitcast float %720 to i32
+ %722 = icmp ne i32 %721, 0
+ %.229 = select i1 %722, float 1.000000e+00, float %716
+ %723 = fcmp oge float %699, 0.000000e+00
+ %724 = sext i1 %723 to i32
+ %725 = bitcast i32 %724 to float
+ %726 = bitcast float %725 to i32
+ %727 = icmp ne i32 %726, 0
+ %temp28.0 = select i1 %727, float 1.000000e+00, float %717
+ %728 = call float @llvm.AMDGPU.lrp(float %659, float %temp28.0, float %.229)
+ %729 = call float @llvm.pow.f32(float %728, float %76)
+ %730 = fmul float %729, %79
+ %731 = fadd float %730, %80
+ %732 = call float @llvm.AMDIL.clamp.(float %731, float 0.000000e+00, float 1.000000e+00)
+ %733 = fmul float %732, %732
+ %734 = fmul float 2.000000e+00, %732
+ %735 = fsub float -0.000000e+00, %734
+ %736 = fadd float 3.000000e+00, %735
+ %737 = fmul float %733, %736
+ %738 = fmul float %548, %737
+ %739 = fmul float %549, %737
+ %740 = fmul float %550, %737
+ %741 = fmul float %738, %515
+ %742 = fadd float %741, %533
+ %743 = fmul float %739, %515
+ %744 = fadd float %743, %534
+ %745 = fmul float %740, %515
+ %746 = fadd float %745, %535
+ %747 = call float @llvm.AMDGPU.lrp(float %230, float %287, float 1.000000e+00)
+ %748 = call float @llvm.AMDGPU.lrp(float %37, float %298, float 1.000000e+00)
+ %749 = call float @llvm.AMDGPU.lrp(float %37, float %299, float 1.000000e+00)
+ %750 = call float @llvm.AMDGPU.lrp(float %37, float %300, float 1.000000e+00)
+ %751 = call float @llvm.AMDGPU.lrp(float %38, float %747, float 1.000000e+00)
+ %752 = fmul float %748, %751
+ %753 = fmul float %749, %751
+ %754 = fmul float %750, %751
+ %755 = fmul float %742, %752
+ %756 = fmul float %744, %753
+ %757 = fmul float %746, %754
+ %758 = fmul float %temp12.0, %216
+ %759 = fmul float %temp13.0, %217
+ %760 = fadd float %759, %758
+ %761 = fmul float %temp14.0, %218
+ %762 = fadd float %760, %761
+ %763 = call float @fabs(float %762)
+ %764 = fmul float %763, %763
+ %765 = fmul float %764, %50
+ %766 = fadd float %765, %51
+ %767 = call float @llvm.AMDIL.clamp.(float %766, float 0.000000e+00, float 1.000000e+00)
+ %768 = fsub float -0.000000e+00, %767
+ %769 = fadd float 1.000000e+00, %768
+ %770 = fmul float %33, %769
+ %771 = fmul float %33, %769
+ %772 = fmul float %33, %769
+ %773 = fmul float %34, %769
+ %774 = call float @llvm.AMDGPU.lrp(float %770, float %31, float %755)
+ %775 = call float @llvm.AMDGPU.lrp(float %771, float %31, float %756)
+ %776 = call float @llvm.AMDGPU.lrp(float %772, float %31, float %757)
+ %777 = call float @llvm.AMDGPU.lrp(float %773, float %32, float %374)
+ %778 = fcmp uge float %774, 0x3E6FFFFE60000000
+ %779 = select i1 %778, float %774, float 0x3E6FFFFE60000000
+ %780 = fcmp uge float %775, 0x3E6FFFFE60000000
+ %781 = select i1 %780, float %775, float 0x3E6FFFFE60000000
+ %782 = fcmp uge float %776, 0x3E6FFFFE60000000
+ %783 = select i1 %782, float %776, float 0x3E6FFFFE60000000
+ %784 = fcmp uge float %779, 6.550400e+04
+ %785 = select i1 %784, float 6.550400e+04, float %779
+ %786 = fcmp uge float %781, 6.550400e+04
+ %787 = select i1 %786, float 6.550400e+04, float %781
+ %788 = fcmp uge float %783, 6.550400e+04
+ %789 = select i1 %788, float 6.550400e+04, float %783
+ %790 = fmul float %777, %52
+ %791 = fadd float %790, %53
+ %792 = call float @llvm.AMDIL.clamp.(float %791, float 0.000000e+00, float 1.000000e+00)
+ %793 = call i32 @llvm.SI.packf16(float %785, float %787)
+ %794 = bitcast i32 %793 to float
+ %795 = call i32 @llvm.SI.packf16(float %789, float %792)
+ %796 = bitcast i32 %795 to float
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %794, float %796, float %794, float %796)
+ ret void
+
+ELSE214: ; preds = %ELSE211
+ %797 = fcmp olt float %565, %81
+ %798 = sext i1 %797 to i32
+ %799 = bitcast i32 %798 to float
+ %800 = bitcast float %799 to i32
+ %801 = icmp ne i32 %800, 0
+ %.230 = select i1 %801, float %104, float %100
+ %.231 = select i1 %801, float %105, float %101
+ %.232 = select i1 %801, float %106, float %102
+ %.233 = select i1 %801, float %107, float %103
+ br label %ENDIF209
+}
+
+; Function Attrs: readnone
+declare float @llvm.AMDIL.clamp.(float, float, float) #2
+
+; Function Attrs: nounwind readnone
+declare <4 x float> @llvm.SI.sample.v2i32(<2 x i32>, <32 x i8>, <16 x i8>, i32) #1
+
+; Function Attrs: readnone
+declare float @llvm.AMDGPU.lrp(float, float, float) #2
+
+; Function Attrs: nounwind readnone
+declare <4 x float> @llvm.SI.samplel.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32) #1
+
+; Function Attrs: readnone
+declare float @llvm.AMDGPU.cndlt(float, float, float) #2
+
+; Function Attrs: readnone
+declare float @llvm.AMDIL.exp.(float) #2
+
+attributes #0 = { "ShaderType"="0" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { readnone }
+attributes #3 = { nounwind readonly }
+attributes #4 = { readonly }
diff --git a/test/CodeGen/AMDGPU/si-spill-cf.ll b/test/CodeGen/AMDGPU/si-spill-cf.ll
new file mode 100644
index 000000000000..4b2d8ec6bf0a
--- /dev/null
+++ b/test/CodeGen/AMDGPU/si-spill-cf.ll
@@ -0,0 +1,501 @@
+; RUN: llc -march=amdgcn -mcpu=SI < %s -verify-machineinstrs | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga < %s -verify-machineinstrs | FileCheck -check-prefix=SI %s
+
+; If a v_readlane_b32 of the saved mask appears after the s_or_b64 below, it is
+; likely due to reordering: the restore was originally supposed to happen
+; before SI_END_CF.
+; SI: s_or_b64 exec, exec, [[SAVED:s\[[0-9]+:[0-9]+\]|[a-z]+]]
+; SI-NOT: v_readlane_b32 [[SAVED]]
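+;
+; Illustrative sketch of the bad ordering this guards against (hypothetical
+; registers, not FileCheck directives):
+;   s_or_b64 exec, exec, s[6:7]   ; end of control flow reads the saved mask
+;   v_readlane_b32 s6, v0, 0      ; restore of the spilled mask arrives too late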
+
+define void @main() #0 {
+main_body:
+ %0 = call float @llvm.SI.load.const(<16 x i8> undef, i32 16)
+ %1 = call float @llvm.SI.load.const(<16 x i8> undef, i32 32)
+ %2 = call float @llvm.SI.load.const(<16 x i8> undef, i32 80)
+ %3 = call float @llvm.SI.load.const(<16 x i8> undef, i32 84)
+ %4 = call float @llvm.SI.load.const(<16 x i8> undef, i32 88)
+ %5 = call float @llvm.SI.load.const(<16 x i8> undef, i32 96)
+ %6 = call float @llvm.SI.load.const(<16 x i8> undef, i32 100)
+ %7 = call float @llvm.SI.load.const(<16 x i8> undef, i32 104)
+ %8 = call float @llvm.SI.load.const(<16 x i8> undef, i32 112)
+ %9 = call float @llvm.SI.load.const(<16 x i8> undef, i32 116)
+ %10 = call float @llvm.SI.load.const(<16 x i8> undef, i32 120)
+ %11 = call float @llvm.SI.load.const(<16 x i8> undef, i32 128)
+ %12 = call float @llvm.SI.load.const(<16 x i8> undef, i32 132)
+ %13 = call float @llvm.SI.load.const(<16 x i8> undef, i32 136)
+ %14 = call float @llvm.SI.load.const(<16 x i8> undef, i32 144)
+ %15 = call float @llvm.SI.load.const(<16 x i8> undef, i32 148)
+ %16 = call float @llvm.SI.load.const(<16 x i8> undef, i32 152)
+ %17 = call float @llvm.SI.load.const(<16 x i8> undef, i32 160)
+ %18 = call float @llvm.SI.load.const(<16 x i8> undef, i32 164)
+ %19 = call float @llvm.SI.load.const(<16 x i8> undef, i32 168)
+ %20 = call float @llvm.SI.load.const(<16 x i8> undef, i32 176)
+ %21 = call float @llvm.SI.load.const(<16 x i8> undef, i32 180)
+ %22 = call float @llvm.SI.load.const(<16 x i8> undef, i32 184)
+ %23 = call float @llvm.SI.load.const(<16 x i8> undef, i32 192)
+ %24 = call float @llvm.SI.load.const(<16 x i8> undef, i32 196)
+ %25 = call float @llvm.SI.load.const(<16 x i8> undef, i32 200)
+ %26 = call float @llvm.SI.load.const(<16 x i8> undef, i32 208)
+ %27 = call float @llvm.SI.load.const(<16 x i8> undef, i32 212)
+ %28 = call float @llvm.SI.load.const(<16 x i8> undef, i32 216)
+ %29 = call float @llvm.SI.load.const(<16 x i8> undef, i32 224)
+ %30 = call float @llvm.SI.load.const(<16 x i8> undef, i32 228)
+ %31 = call float @llvm.SI.load.const(<16 x i8> undef, i32 232)
+ %32 = call float @llvm.SI.load.const(<16 x i8> undef, i32 240)
+ %33 = call float @llvm.SI.load.const(<16 x i8> undef, i32 244)
+ %34 = call float @llvm.SI.load.const(<16 x i8> undef, i32 248)
+ %35 = call float @llvm.SI.load.const(<16 x i8> undef, i32 256)
+ %36 = call float @llvm.SI.load.const(<16 x i8> undef, i32 260)
+ %37 = call float @llvm.SI.load.const(<16 x i8> undef, i32 264)
+ %38 = call float @llvm.SI.load.const(<16 x i8> undef, i32 272)
+ %39 = call float @llvm.SI.load.const(<16 x i8> undef, i32 276)
+ %40 = call float @llvm.SI.load.const(<16 x i8> undef, i32 280)
+ %41 = call float @llvm.SI.load.const(<16 x i8> undef, i32 288)
+ %42 = call float @llvm.SI.load.const(<16 x i8> undef, i32 292)
+ %43 = call float @llvm.SI.load.const(<16 x i8> undef, i32 296)
+ %44 = call float @llvm.SI.load.const(<16 x i8> undef, i32 304)
+ %45 = call float @llvm.SI.load.const(<16 x i8> undef, i32 308)
+ %46 = call float @llvm.SI.load.const(<16 x i8> undef, i32 312)
+ %47 = call float @llvm.SI.load.const(<16 x i8> undef, i32 320)
+ %48 = call float @llvm.SI.load.const(<16 x i8> undef, i32 324)
+ %49 = call float @llvm.SI.load.const(<16 x i8> undef, i32 328)
+ %50 = call float @llvm.SI.load.const(<16 x i8> undef, i32 336)
+ %51 = call float @llvm.SI.load.const(<16 x i8> undef, i32 340)
+ %52 = call float @llvm.SI.load.const(<16 x i8> undef, i32 344)
+ %53 = call float @llvm.SI.load.const(<16 x i8> undef, i32 352)
+ %54 = call float @llvm.SI.load.const(<16 x i8> undef, i32 356)
+ %55 = call float @llvm.SI.load.const(<16 x i8> undef, i32 360)
+ %56 = call float @llvm.SI.load.const(<16 x i8> undef, i32 368)
+ %57 = call float @llvm.SI.load.const(<16 x i8> undef, i32 372)
+ %58 = call float @llvm.SI.load.const(<16 x i8> undef, i32 376)
+ %59 = call float @llvm.SI.load.const(<16 x i8> undef, i32 384)
+ %60 = call float @llvm.SI.load.const(<16 x i8> undef, i32 388)
+ %61 = call float @llvm.SI.load.const(<16 x i8> undef, i32 392)
+ %62 = call float @llvm.SI.load.const(<16 x i8> undef, i32 400)
+ %63 = call float @llvm.SI.load.const(<16 x i8> undef, i32 404)
+ %64 = call float @llvm.SI.load.const(<16 x i8> undef, i32 408)
+ %65 = call float @llvm.SI.load.const(<16 x i8> undef, i32 416)
+ %66 = call float @llvm.SI.load.const(<16 x i8> undef, i32 420)
+ br label %LOOP
+
+LOOP: ; preds = %ENDIF2795, %main_body
+ %temp894.0 = phi float [ 0.000000e+00, %main_body ], [ %temp894.1, %ENDIF2795 ]
+ %temp18.0 = phi float [ undef, %main_body ], [ %temp18.1, %ENDIF2795 ]
+ %67 = icmp sgt i32 undef, 4
+ br i1 %67, label %ENDLOOP, label %ENDIF
+
+ENDLOOP: ; preds = %ELSE2566, %LOOP
+ %68 = call float @llvm.AMDGPU.lrp(float %0, float undef, float undef)
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float undef, float %68, float undef, float 1.000000e+00)
+ ret void
+
+ENDIF: ; preds = %LOOP
+ %69 = fsub float %2, undef
+ %70 = fsub float %3, undef
+ %71 = fsub float %4, undef
+ %72 = fmul float %69, 0.000000e+00
+ %73 = fmul float %70, undef
+ %74 = fmul float %71, undef
+ %75 = fsub float %6, undef
+ %76 = fsub float %7, undef
+ %77 = fmul float %75, undef
+ %78 = fmul float %76, 0.000000e+00
+ %79 = call float @llvm.minnum.f32(float %74, float %78)
+ %80 = call float @llvm.maxnum.f32(float %72, float 0.000000e+00)
+ %81 = call float @llvm.maxnum.f32(float %73, float %77)
+ %82 = call float @llvm.maxnum.f32(float undef, float %79)
+ %83 = call float @llvm.minnum.f32(float %80, float %81)
+ %84 = call float @llvm.minnum.f32(float %83, float undef)
+ %85 = fsub float %14, undef
+ %86 = fsub float %15, undef
+ %87 = fsub float %16, undef
+ %88 = fmul float %85, undef
+ %89 = fmul float %86, undef
+ %90 = fmul float %87, undef
+ %91 = fsub float %17, undef
+ %92 = fsub float %18, undef
+ %93 = fsub float %19, undef
+ %94 = fmul float %91, 0.000000e+00
+ %95 = fmul float %92, undef
+ %96 = fmul float %93, undef
+ %97 = call float @llvm.minnum.f32(float %89, float %95)
+ %98 = call float @llvm.maxnum.f32(float %88, float %94)
+ %99 = call float @llvm.maxnum.f32(float %90, float %96)
+ %100 = call float @llvm.maxnum.f32(float undef, float %97)
+ %101 = call float @llvm.maxnum.f32(float %100, float undef)
+ %102 = call float @llvm.minnum.f32(float %98, float undef)
+ %103 = call float @llvm.minnum.f32(float %102, float %99)
+ %104 = fsub float %30, undef
+ %105 = fsub float %31, undef
+ %106 = fmul float %104, 0.000000e+00
+ %107 = fmul float %105, 0.000000e+00
+ %108 = call float @llvm.minnum.f32(float undef, float %106)
+ %109 = call float @llvm.maxnum.f32(float undef, float %107)
+ %110 = call float @llvm.maxnum.f32(float undef, float %108)
+ %111 = call float @llvm.maxnum.f32(float %110, float undef)
+ %112 = call float @llvm.minnum.f32(float undef, float %109)
+ %113 = fsub float %32, undef
+ %114 = fsub float %33, undef
+ %115 = fsub float %34, undef
+ %116 = fmul float %113, 0.000000e+00
+ %117 = fmul float %114, undef
+ %118 = fmul float %115, undef
+ %119 = fsub float %35, undef
+ %120 = fsub float %36, undef
+ %121 = fsub float %37, undef
+ %122 = fmul float %119, undef
+ %123 = fmul float %120, undef
+ %124 = fmul float %121, undef
+ %125 = call float @llvm.minnum.f32(float %116, float %122)
+ %126 = call float @llvm.minnum.f32(float %117, float %123)
+ %127 = call float @llvm.minnum.f32(float %118, float %124)
+ %128 = call float @llvm.maxnum.f32(float %125, float %126)
+ %129 = call float @llvm.maxnum.f32(float %128, float %127)
+ %130 = fsub float %38, undef
+ %131 = fsub float %39, undef
+ %132 = fsub float %40, undef
+ %133 = fmul float %130, 0.000000e+00
+ %134 = fmul float %131, undef
+ %135 = fmul float %132, undef
+ %136 = fsub float %41, undef
+ %137 = fsub float %42, undef
+ %138 = fsub float %43, undef
+ %139 = fmul float %136, undef
+ %140 = fmul float %137, undef
+ %141 = fmul float %138, undef
+ %142 = call float @llvm.minnum.f32(float %133, float %139)
+ %143 = call float @llvm.minnum.f32(float %134, float %140)
+ %144 = call float @llvm.minnum.f32(float %135, float %141)
+ %145 = call float @llvm.maxnum.f32(float %142, float %143)
+ %146 = call float @llvm.maxnum.f32(float %145, float %144)
+ %147 = fsub float %44, undef
+ %148 = fsub float %45, undef
+ %149 = fsub float %46, undef
+ %150 = fmul float %147, 0.000000e+00
+ %151 = fmul float %148, 0.000000e+00
+ %152 = fmul float %149, undef
+ %153 = fsub float %47, undef
+ %154 = fsub float %48, undef
+ %155 = fsub float %49, undef
+ %156 = fmul float %153, undef
+ %157 = fmul float %154, 0.000000e+00
+ %158 = fmul float %155, undef
+ %159 = call float @llvm.minnum.f32(float %150, float %156)
+ %160 = call float @llvm.minnum.f32(float %151, float %157)
+ %161 = call float @llvm.minnum.f32(float %152, float %158)
+ %162 = call float @llvm.maxnum.f32(float %159, float %160)
+ %163 = call float @llvm.maxnum.f32(float %162, float %161)
+ %164 = fsub float %50, undef
+ %165 = fsub float %51, undef
+ %166 = fsub float %52, undef
+ %167 = fmul float %164, undef
+ %168 = fmul float %165, 0.000000e+00
+ %169 = fmul float %166, 0.000000e+00
+ %170 = fsub float %53, undef
+ %171 = fsub float %54, undef
+ %172 = fsub float %55, undef
+ %173 = fdiv float 1.000000e+00, %temp18.0
+ %174 = fmul float %170, undef
+ %175 = fmul float %171, undef
+ %176 = fmul float %172, %173
+ %177 = call float @llvm.minnum.f32(float %167, float %174)
+ %178 = call float @llvm.minnum.f32(float %168, float %175)
+ %179 = call float @llvm.minnum.f32(float %169, float %176)
+ %180 = call float @llvm.maxnum.f32(float %177, float %178)
+ %181 = call float @llvm.maxnum.f32(float %180, float %179)
+ %182 = fsub float %62, undef
+ %183 = fsub float %63, undef
+ %184 = fsub float %64, undef
+ %185 = fmul float %182, 0.000000e+00
+ %186 = fmul float %183, undef
+ %187 = fmul float %184, undef
+ %188 = fsub float %65, undef
+ %189 = fsub float %66, undef
+ %190 = fmul float %188, undef
+ %191 = fmul float %189, undef
+ %192 = call float @llvm.maxnum.f32(float %185, float %190)
+ %193 = call float @llvm.maxnum.f32(float %186, float %191)
+ %194 = call float @llvm.maxnum.f32(float %187, float undef)
+ %195 = call float @llvm.minnum.f32(float %192, float %193)
+ %196 = call float @llvm.minnum.f32(float %195, float %194)
+ %.temp292.7 = select i1 undef, float %163, float undef
+ %temp292.9 = select i1 false, float %181, float %.temp292.7
+ %.temp292.9 = select i1 undef, float undef, float %temp292.9
+ %197 = fcmp ogt float undef, 0.000000e+00
+ %198 = fcmp olt float undef, %196
+ %199 = and i1 %197, %198
+ %200 = fcmp olt float undef, %.temp292.9
+ %201 = and i1 %199, %200
+ %temp292.11 = select i1 %201, float undef, float %.temp292.9
+ br i1 undef, label %IF2565, label %ELSE2566
+
+IF2565: ; preds = %ENDIF
+ br i1 false, label %ENDIF2582, label %ELSE2584
+
+ELSE2566: ; preds = %ENDIF
+ %202 = fcmp oeq float %temp292.11, 1.000000e+04
+ br i1 %202, label %ENDLOOP, label %ELSE2593
+
+ENDIF2564: ; preds = %ENDIF2594, %ENDIF2588
+ %temp894.1 = phi float [ undef, %ENDIF2588 ], [ %temp894.2, %ENDIF2594 ]
+ %temp18.1 = phi float [ %219, %ENDIF2588 ], [ undef, %ENDIF2594 ]
+ %203 = fsub float %5, undef
+ %204 = fmul float %203, undef
+ %205 = call float @llvm.maxnum.f32(float undef, float %204)
+ %206 = call float @llvm.minnum.f32(float %205, float undef)
+ %207 = call float @llvm.minnum.f32(float %206, float undef)
+ %208 = fcmp ogt float undef, 0.000000e+00
+ %209 = fcmp olt float undef, 1.000000e+00
+ %210 = and i1 %208, %209
+ %211 = fcmp olt float undef, %207
+ %212 = and i1 %210, %211
+ br i1 %212, label %ENDIF2795, label %ELSE2797
+
+ELSE2584: ; preds = %IF2565
+ br label %ENDIF2582
+
+ENDIF2582: ; preds = %ELSE2584, %IF2565
+ %213 = fadd float %1, undef
+ %214 = fadd float 0.000000e+00, %213
+ %215 = call float @llvm.AMDIL.fraction.(float %214)
+ br i1 undef, label %IF2589, label %ELSE2590
+
+IF2589: ; preds = %ENDIF2582
+ br label %ENDIF2588
+
+ELSE2590: ; preds = %ENDIF2582
+ br label %ENDIF2588
+
+ENDIF2588: ; preds = %ELSE2590, %IF2589
+ %216 = fsub float 1.000000e+00, %215
+ %217 = call float @llvm.sqrt.f32(float %216)
+ %218 = fmul float %217, undef
+ %219 = fadd float %218, undef
+ br label %ENDIF2564
+
+ELSE2593: ; preds = %ELSE2566
+ %220 = fcmp oeq float %temp292.11, %82
+ %221 = fcmp olt float %82, %84
+ %222 = and i1 %220, %221
+ br i1 %222, label %ENDIF2594, label %ELSE2596
+
+ELSE2596: ; preds = %ELSE2593
+ %223 = fcmp oeq float %temp292.11, %101
+ %224 = fcmp olt float %101, %103
+ %225 = and i1 %223, %224
+ br i1 %225, label %ENDIF2594, label %ELSE2632
+
+ENDIF2594: ; preds = %ELSE2788, %ELSE2785, %ELSE2782, %ELSE2779, %IF2775, %ELSE2761, %ELSE2758, %IF2757, %ELSE2704, %ELSE2686, %ELSE2671, %ELSE2668, %IF2667, %ELSE2632, %ELSE2596, %ELSE2593
+ %temp894.2 = phi float [ 0.000000e+00, %IF2667 ], [ 0.000000e+00, %ELSE2671 ], [ 0.000000e+00, %IF2757 ], [ 0.000000e+00, %ELSE2761 ], [ %temp894.0, %ELSE2758 ], [ 0.000000e+00, %IF2775 ], [ 0.000000e+00, %ELSE2779 ], [ 0.000000e+00, %ELSE2782 ], [ %.2848, %ELSE2788 ], [ 0.000000e+00, %ELSE2785 ], [ 0.000000e+00, %ELSE2593 ], [ 0.000000e+00, %ELSE2632 ], [ 0.000000e+00, %ELSE2704 ], [ 0.000000e+00, %ELSE2686 ], [ 0.000000e+00, %ELSE2668 ], [ 0.000000e+00, %ELSE2596 ]
+ %226 = fmul float %temp894.2, undef
+ br label %ENDIF2564
+
+ELSE2632: ; preds = %ELSE2596
+ br i1 undef, label %ENDIF2594, label %ELSE2650
+
+ELSE2650: ; preds = %ELSE2632
+ %227 = fcmp oeq float %temp292.11, %111
+ %228 = fcmp olt float %111, %112
+ %229 = and i1 %227, %228
+ br i1 %229, label %IF2667, label %ELSE2668
+
+IF2667: ; preds = %ELSE2650
+ br i1 undef, label %ENDIF2594, label %ELSE2671
+
+ELSE2668: ; preds = %ELSE2650
+ %230 = fcmp oeq float %temp292.11, %129
+ %231 = fcmp olt float %129, undef
+ %232 = and i1 %230, %231
+ br i1 %232, label %ENDIF2594, label %ELSE2686
+
+ELSE2671: ; preds = %IF2667
+ br label %ENDIF2594
+
+ELSE2686: ; preds = %ELSE2668
+ %233 = fcmp oeq float %temp292.11, %146
+ %234 = fcmp olt float %146, undef
+ %235 = and i1 %233, %234
+ br i1 %235, label %ENDIF2594, label %ELSE2704
+
+ELSE2704: ; preds = %ELSE2686
+ %236 = fcmp oeq float %temp292.11, %181
+ %237 = fcmp olt float %181, undef
+ %238 = and i1 %236, %237
+ br i1 %238, label %ENDIF2594, label %ELSE2740
+
+ELSE2740: ; preds = %ELSE2704
+ br i1 undef, label %IF2757, label %ELSE2758
+
+IF2757: ; preds = %ELSE2740
+ br i1 undef, label %ENDIF2594, label %ELSE2761
+
+ELSE2758: ; preds = %ELSE2740
+ br i1 undef, label %IF2775, label %ENDIF2594
+
+ELSE2761: ; preds = %IF2757
+ br label %ENDIF2594
+
+IF2775: ; preds = %ELSE2758
+ %239 = fcmp olt float undef, undef
+ br i1 %239, label %ENDIF2594, label %ELSE2779
+
+ELSE2779: ; preds = %IF2775
+ br i1 undef, label %ENDIF2594, label %ELSE2782
+
+ELSE2782: ; preds = %ELSE2779
+ br i1 undef, label %ENDIF2594, label %ELSE2785
+
+ELSE2785: ; preds = %ELSE2782
+ %240 = fcmp olt float undef, 0.000000e+00
+ br i1 %240, label %ENDIF2594, label %ELSE2788
+
+ELSE2788: ; preds = %ELSE2785
+ %241 = fcmp olt float 0.000000e+00, undef
+ %.2848 = select i1 %241, float -1.000000e+00, float 1.000000e+00
+ br label %ENDIF2594
+
+ELSE2797: ; preds = %ENDIF2564
+ %242 = fsub float %8, undef
+ %243 = fsub float %9, undef
+ %244 = fsub float %10, undef
+ %245 = fmul float %242, undef
+ %246 = fmul float %243, undef
+ %247 = fmul float %244, undef
+ %248 = fsub float %11, undef
+ %249 = fsub float %12, undef
+ %250 = fsub float %13, undef
+ %251 = fmul float %248, undef
+ %252 = fmul float %249, undef
+ %253 = fmul float %250, undef
+ %254 = call float @llvm.minnum.f32(float %245, float %251)
+ %255 = call float @llvm.minnum.f32(float %246, float %252)
+ %256 = call float @llvm.maxnum.f32(float %247, float %253)
+ %257 = call float @llvm.maxnum.f32(float %254, float %255)
+ %258 = call float @llvm.maxnum.f32(float %257, float undef)
+ %259 = call float @llvm.minnum.f32(float undef, float %256)
+ %260 = fcmp ogt float %258, 0.000000e+00
+ %261 = fcmp olt float %258, 1.000000e+00
+ %262 = and i1 %260, %261
+ %263 = fcmp olt float %258, %259
+ %264 = and i1 %262, %263
+ br i1 %264, label %ENDIF2795, label %ELSE2800
+
+ENDIF2795: ; preds = %ELSE2824, %ELSE2821, %ELSE2818, %ELSE2815, %ELSE2812, %ELSE2809, %ELSE2806, %ELSE2803, %ELSE2800, %ELSE2797, %ENDIF2564
+ br label %LOOP
+
+ELSE2800: ; preds = %ELSE2797
+ br i1 undef, label %ENDIF2795, label %ELSE2803
+
+ELSE2803: ; preds = %ELSE2800
+ %265 = fsub float %20, undef
+ %266 = fsub float %21, undef
+ %267 = fsub float %22, undef
+ %268 = fmul float %265, undef
+ %269 = fmul float %266, undef
+ %270 = fmul float %267, 0.000000e+00
+ %271 = fsub float %23, undef
+ %272 = fsub float %24, undef
+ %273 = fsub float %25, undef
+ %274 = fmul float %271, undef
+ %275 = fmul float %272, undef
+ %276 = fmul float %273, undef
+ %277 = call float @llvm.minnum.f32(float %268, float %274)
+ %278 = call float @llvm.maxnum.f32(float %269, float %275)
+ %279 = call float @llvm.maxnum.f32(float %270, float %276)
+ %280 = call float @llvm.maxnum.f32(float %277, float undef)
+ %281 = call float @llvm.maxnum.f32(float %280, float undef)
+ %282 = call float @llvm.minnum.f32(float undef, float %278)
+ %283 = call float @llvm.minnum.f32(float %282, float %279)
+ %284 = fcmp ogt float %281, 0.000000e+00
+ %285 = fcmp olt float %281, 1.000000e+00
+ %286 = and i1 %284, %285
+ %287 = fcmp olt float %281, %283
+ %288 = and i1 %286, %287
+ br i1 %288, label %ENDIF2795, label %ELSE2806
+
+ELSE2806: ; preds = %ELSE2803
+ %289 = fsub float %26, undef
+ %290 = fsub float %27, undef
+ %291 = fsub float %28, undef
+ %292 = fmul float %289, undef
+ %293 = fmul float %290, 0.000000e+00
+ %294 = fmul float %291, undef
+ %295 = fsub float %29, undef
+ %296 = fmul float %295, undef
+ %297 = call float @llvm.minnum.f32(float %292, float %296)
+ %298 = call float @llvm.minnum.f32(float %293, float undef)
+ %299 = call float @llvm.maxnum.f32(float %294, float undef)
+ %300 = call float @llvm.maxnum.f32(float %297, float %298)
+ %301 = call float @llvm.maxnum.f32(float %300, float undef)
+ %302 = call float @llvm.minnum.f32(float undef, float %299)
+ %303 = fcmp ogt float %301, 0.000000e+00
+ %304 = fcmp olt float %301, 1.000000e+00
+ %305 = and i1 %303, %304
+ %306 = fcmp olt float %301, %302
+ %307 = and i1 %305, %306
+ br i1 %307, label %ENDIF2795, label %ELSE2809
+
+ELSE2809: ; preds = %ELSE2806
+ br i1 undef, label %ENDIF2795, label %ELSE2812
+
+ELSE2812: ; preds = %ELSE2809
+ br i1 undef, label %ENDIF2795, label %ELSE2815
+
+ELSE2815: ; preds = %ELSE2812
+ br i1 undef, label %ENDIF2795, label %ELSE2818
+
+ELSE2818: ; preds = %ELSE2815
+ br i1 undef, label %ENDIF2795, label %ELSE2821
+
+ELSE2821: ; preds = %ELSE2818
+ %308 = fsub float %56, undef
+ %309 = fsub float %57, undef
+ %310 = fsub float %58, undef
+ %311 = fmul float %308, undef
+ %312 = fmul float %309, 0.000000e+00
+ %313 = fmul float %310, undef
+ %314 = fsub float %59, undef
+ %315 = fsub float %60, undef
+ %316 = fsub float %61, undef
+ %317 = fmul float %314, undef
+ %318 = fmul float %315, undef
+ %319 = fmul float %316, undef
+ %320 = call float @llvm.maxnum.f32(float %311, float %317)
+ %321 = call float @llvm.maxnum.f32(float %312, float %318)
+ %322 = call float @llvm.maxnum.f32(float %313, float %319)
+ %323 = call float @llvm.minnum.f32(float %320, float %321)
+ %324 = call float @llvm.minnum.f32(float %323, float %322)
+ %325 = fcmp ogt float undef, 0.000000e+00
+ %326 = fcmp olt float undef, 1.000000e+00
+ %327 = and i1 %325, %326
+ %328 = fcmp olt float undef, %324
+ %329 = and i1 %327, %328
+ br i1 %329, label %ENDIF2795, label %ELSE2824
+
+ELSE2824: ; preds = %ELSE2821
+ %.2849 = select i1 undef, float 0.000000e+00, float 1.000000e+00
+ br label %ENDIF2795
+}
+
+; Function Attrs: nounwind readnone
+declare float @llvm.SI.load.const(<16 x i8>, i32) #1
+
+; Function Attrs: readnone
+declare float @llvm.AMDIL.fraction.(float) #2
+
+; Function Attrs: nounwind readnone
+declare float @llvm.sqrt.f32(float) #1
+
+; Function Attrs: nounwind readnone
+declare float @llvm.minnum.f32(float, float) #1
+
+; Function Attrs: nounwind readnone
+declare float @llvm.maxnum.f32(float, float) #1
+
+; Function Attrs: readnone
+declare float @llvm.AMDGPU.lrp(float, float, float) #2
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+
+attributes #0 = { "ShaderType"="0" "enable-no-nans-fp-math"="true" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { readnone }
diff --git a/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll b/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
new file mode 100644
index 000000000000..5a6129aaa3fa
--- /dev/null
+++ b/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
@@ -0,0 +1,236 @@
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -enable-misched -enable-aa-sched-mi < %s | FileCheck -check-prefix=FUNC -check-prefix=CI %s
+
+declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)
+declare void @llvm.SI.tbuffer.store.v4i32(<16 x i8>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)
+declare void @llvm.AMDGPU.barrier.local() #2
+
+
+@stored_lds_ptr = addrspace(3) global i32 addrspace(3)* undef, align 4
+@stored_constant_ptr = addrspace(3) global i32 addrspace(2)* undef, align 8
+@stored_global_ptr = addrspace(3) global i32 addrspace(1)* undef, align 8
+
+; FUNC-LABEL: @reorder_local_load_global_store_local_load
+; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:4
+; CI-NEXT: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:8
+; CI: buffer_store_dword
+define void @reorder_local_load_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
+ %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4
+
+ %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 1
+ %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 2
+
+ %tmp1 = load i32, i32 addrspace(3)* %ptr1, align 4
+ store i32 99, i32 addrspace(1)* %gptr, align 4
+ %tmp2 = load i32, i32 addrspace(3)* %ptr2, align 4
+
+ %add = add nsw i32 %tmp1, %tmp2
+
+ store i32 %add, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @no_reorder_local_load_volatile_global_store_local_load
+; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:4
+; CI: buffer_store_dword
+; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:8
+define void @no_reorder_local_load_volatile_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
+ %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4
+
+ %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 1
+ %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 2
+
+ %tmp1 = load i32, i32 addrspace(3)* %ptr1, align 4
+ store volatile i32 99, i32 addrspace(1)* %gptr, align 4
+ %tmp2 = load i32, i32 addrspace(3)* %ptr2, align 4
+
+ %add = add nsw i32 %tmp1, %tmp2
+
+ store i32 %add, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @no_reorder_barrier_local_load_global_store_local_load
+; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:4
+; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:8
+; CI: buffer_store_dword
+define void @no_reorder_barrier_local_load_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
+ %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4
+
+ %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 1
+ %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 2
+
+ %tmp1 = load i32, i32 addrspace(3)* %ptr1, align 4
+ store i32 99, i32 addrspace(1)* %gptr, align 4
+ call void @llvm.AMDGPU.barrier.local() #2
+ %tmp2 = load i32, i32 addrspace(3)* %ptr2, align 4
+
+ %add = add nsw i32 %tmp1, %tmp2
+
+ store i32 %add, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; Technically we could reorder these, but just comparing the
+; instruction type of the load is insufficient.
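+; Both the addrspace(2) loads and the addrspace(1) store select to buffer
+; instructions here, so the opcode alone does not prove the loads touch
+; read-only constant memory; presumably an alias check on the memory operands
+; would be needed before reordering them across the store.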
+
+; FUNC-LABEL: @no_reorder_constant_load_global_store_constant_load
+; CI: buffer_load_dword
+; CI: buffer_store_dword
+; CI: buffer_load_dword
+; CI: buffer_store_dword
+define void @no_reorder_constant_load_global_store_constant_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
+ %ptr0 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(3)* @stored_constant_ptr, align 8
+
+ %ptr1 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 1
+ %ptr2 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 2
+
+ %tmp1 = load i32, i32 addrspace(2)* %ptr1, align 4
+ store i32 99, i32 addrspace(1)* %gptr, align 4
+ %tmp2 = load i32, i32 addrspace(2)* %ptr2, align 4
+
+ %add = add nsw i32 %tmp1, %tmp2
+
+ store i32 %add, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @reorder_constant_load_local_store_constant_load
+; CI: buffer_load_dword
+; CI: buffer_load_dword
+; CI: ds_write_b32
+; CI: buffer_store_dword
+define void @reorder_constant_load_local_store_constant_load(i32 addrspace(1)* %out, i32 addrspace(3)* %lptr) #0 {
+ %ptr0 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(3)* @stored_constant_ptr, align 8
+
+ %ptr1 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 1
+ %ptr2 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 2
+
+ %tmp1 = load i32, i32 addrspace(2)* %ptr1, align 4
+ store i32 99, i32 addrspace(3)* %lptr, align 4
+ %tmp2 = load i32, i32 addrspace(2)* %ptr2, align 4
+
+ %add = add nsw i32 %tmp1, %tmp2
+
+ store i32 %add, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @reorder_smrd_load_local_store_smrd_load
+; CI: s_load_dword
+; CI: s_load_dword
+; CI: s_load_dword
+; CI: ds_write_b32
+; CI: buffer_store_dword
+define void @reorder_smrd_load_local_store_smrd_load(i32 addrspace(1)* %out, i32 addrspace(3)* noalias %lptr, i32 addrspace(2)* %ptr0) #0 {
+ %ptr1 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 1
+ %ptr2 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 2
+
+ %tmp1 = load i32, i32 addrspace(2)* %ptr1, align 4
+ store i32 99, i32 addrspace(3)* %lptr, align 4
+ %tmp2 = load i32, i32 addrspace(2)* %ptr2, align 4
+
+ %add = add nsw i32 %tmp1, %tmp2
+
+ store i32 %add, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @reorder_global_load_local_store_global_load
+; CI: buffer_load_dword
+; CI: buffer_load_dword
+; CI: ds_write_b32
+; CI: buffer_store_dword
+define void @reorder_global_load_local_store_global_load(i32 addrspace(1)* %out, i32 addrspace(3)* %lptr, i32 addrspace(1)* %ptr0) #0 {
+ %ptr1 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i64 1
+ %ptr2 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i64 2
+
+ %tmp1 = load i32, i32 addrspace(1)* %ptr1, align 4
+ store i32 99, i32 addrspace(3)* %lptr, align 4
+ %tmp2 = load i32, i32 addrspace(1)* %ptr2, align 4
+
+ %add = add nsw i32 %tmp1, %tmp2
+
+ store i32 %add, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @reorder_local_offsets
+; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12
+; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:400
+; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:404
+; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:400
+; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:404
+; CI: buffer_store_dword
+; CI: s_endpgm
+define void @reorder_local_offsets(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* noalias nocapture readnone %gptr, i32 addrspace(3)* noalias nocapture %ptr0) #0 {
+ %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 3
+ %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 100
+ %ptr3 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 101
+
+ store i32 123, i32 addrspace(3)* %ptr1, align 4
+ %tmp1 = load i32, i32 addrspace(3)* %ptr2, align 4
+ %tmp2 = load i32, i32 addrspace(3)* %ptr3, align 4
+ store i32 123, i32 addrspace(3)* %ptr2, align 4
+ %tmp3 = load i32, i32 addrspace(3)* %ptr1, align 4
+ store i32 789, i32 addrspace(3)* %ptr3, align 4
+
+ %add.0 = add nsw i32 %tmp2, %tmp1
+ %add.1 = add nsw i32 %add.0, %tmp3
+ store i32 %add.1, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: @reorder_global_offsets
+; CI: buffer_store_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12
+; CI: buffer_load_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400
+; CI: buffer_load_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:404
+; CI: buffer_store_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400
+; CI: buffer_store_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:404
+; CI: buffer_store_dword
+; CI: s_endpgm
+define void @reorder_global_offsets(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* noalias nocapture readnone %gptr, i32 addrspace(1)* noalias nocapture %ptr0) #0 {
+ %ptr1 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 3
+ %ptr2 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 100
+ %ptr3 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 101
+
+ store i32 123, i32 addrspace(1)* %ptr1, align 4
+ %tmp1 = load i32, i32 addrspace(1)* %ptr2, align 4
+ %tmp2 = load i32, i32 addrspace(1)* %ptr3, align 4
+ store i32 123, i32 addrspace(1)* %ptr2, align 4
+ %tmp3 = load i32, i32 addrspace(1)* %ptr1, align 4
+ store i32 789, i32 addrspace(1)* %ptr3, align 4
+
+ %add.0 = add nsw i32 %tmp2, %tmp1
+ %add.1 = add nsw i32 %add.0, %tmp3
+ store i32 %add.1, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; XFUNC-LABEL: @reorder_local_load_tbuffer_store_local_load
+; XCI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}}, 0x4
+; XCI: TBUFFER_STORE_FORMAT
+; XCI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}}, 0x8
+; define void @reorder_local_load_tbuffer_store_local_load(i32 addrspace(1)* %out, i32 %a1, i32 %vaddr) #1 {
+; %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4
+
+; %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 1
+; %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 2
+
+; %tmp1 = load i32, i32 addrspace(3)* %ptr1, align 4
+
+; %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0
+; call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata,
+; i32 4, i32 %vaddr, i32 0, i32 32, i32 14, i32 4, i32 1, i32 0, i32 1,
+; i32 1, i32 0)
+
+; %tmp2 = load i32, i32 addrspace(3)* %ptr2, align 4
+
+; %add = add nsw i32 %tmp1, %tmp2
+
+; store i32 %add, i32 addrspace(1)* %out, align 4
+; ret void
+; }
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #1 = { "ShaderType"="1" nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #2 = { nounwind noduplicate }
diff --git a/test/CodeGen/AMDGPU/si-vector-hang.ll b/test/CodeGen/AMDGPU/si-vector-hang.ll
new file mode 100644
index 000000000000..bd427dd3ed46
--- /dev/null
+++ b/test/CodeGen/AMDGPU/si-vector-hang.ll
@@ -0,0 +1,105 @@
+; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
+
+; CHECK: {{^}}test_8_min_char:
+; CHECK: buffer_store_byte
+; CHECK: buffer_store_byte
+; CHECK: buffer_store_byte
+; CHECK: buffer_store_byte
+; CHECK: buffer_store_byte
+; CHECK: buffer_store_byte
+; CHECK: buffer_store_byte
+; CHECK: buffer_store_byte
+; ModuleID = 'radeon'
+
+define void @test_8_min_char(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture readonly %in0, i8 addrspace(1)* nocapture readonly %in1) #0 {
+entry:
+ %0 = load i8, i8 addrspace(1)* %in0, align 1
+ %1 = insertelement <8 x i8> undef, i8 %0, i32 0
+ %arrayidx2.i.i = getelementptr inbounds i8, i8 addrspace(1)* %in0, i64 1
+ %2 = load i8, i8 addrspace(1)* %arrayidx2.i.i, align 1
+ %3 = insertelement <8 x i8> %1, i8 %2, i32 1
+ %arrayidx6.i.i = getelementptr inbounds i8, i8 addrspace(1)* %in0, i64 2
+ %4 = load i8, i8 addrspace(1)* %arrayidx6.i.i, align 1
+ %5 = insertelement <8 x i8> %3, i8 %4, i32 2
+ %arrayidx10.i.i = getelementptr inbounds i8, i8 addrspace(1)* %in0, i64 3
+ %6 = load i8, i8 addrspace(1)* %arrayidx10.i.i, align 1
+ %7 = insertelement <8 x i8> %5, i8 %6, i32 3
+ %arrayidx.i.i = getelementptr inbounds i8, i8 addrspace(1)* %in0, i64 4
+ %8 = load i8, i8 addrspace(1)* %arrayidx.i.i, align 1
+ %9 = insertelement <8 x i8> undef, i8 %8, i32 0
+ %arrayidx2.i9.i = getelementptr inbounds i8, i8 addrspace(1)* %in0, i64 5
+ %10 = load i8, i8 addrspace(1)* %arrayidx2.i9.i, align 1
+ %11 = insertelement <8 x i8> %9, i8 %10, i32 1
+ %arrayidx6.i11.i = getelementptr inbounds i8, i8 addrspace(1)* %in0, i64 6
+ %12 = load i8, i8 addrspace(1)* %arrayidx6.i11.i, align 1
+ %13 = insertelement <8 x i8> %11, i8 %12, i32 2
+ %arrayidx10.i13.i = getelementptr inbounds i8, i8 addrspace(1)* %in0, i64 7
+ %14 = load i8, i8 addrspace(1)* %arrayidx10.i13.i, align 1
+ %15 = insertelement <8 x i8> %13, i8 %14, i32 3
+ %vecinit5.i = shufflevector <8 x i8> %7, <8 x i8> %15, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+ %16 = load i8, i8 addrspace(1)* %in1, align 1
+ %17 = insertelement <8 x i8> undef, i8 %16, i32 0
+ %arrayidx2.i.i4 = getelementptr inbounds i8, i8 addrspace(1)* %in1, i64 1
+ %18 = load i8, i8 addrspace(1)* %arrayidx2.i.i4, align 1
+ %19 = insertelement <8 x i8> %17, i8 %18, i32 1
+ %arrayidx6.i.i5 = getelementptr inbounds i8, i8 addrspace(1)* %in1, i64 2
+ %20 = load i8, i8 addrspace(1)* %arrayidx6.i.i5, align 1
+ %21 = insertelement <8 x i8> %19, i8 %20, i32 2
+ %arrayidx10.i.i6 = getelementptr inbounds i8, i8 addrspace(1)* %in1, i64 3
+ %22 = load i8, i8 addrspace(1)* %arrayidx10.i.i6, align 1
+ %23 = insertelement <8 x i8> %21, i8 %22, i32 3
+ %arrayidx.i.i7 = getelementptr inbounds i8, i8 addrspace(1)* %in1, i64 4
+ %24 = load i8, i8 addrspace(1)* %arrayidx.i.i7, align 1
+ %25 = insertelement <8 x i8> undef, i8 %24, i32 0
+ %arrayidx2.i9.i8 = getelementptr inbounds i8, i8 addrspace(1)* %in1, i64 5
+ %26 = load i8, i8 addrspace(1)* %arrayidx2.i9.i8, align 1
+ %27 = insertelement <8 x i8> %25, i8 %26, i32 1
+ %arrayidx6.i11.i9 = getelementptr inbounds i8, i8 addrspace(1)* %in1, i64 6
+ %28 = load i8, i8 addrspace(1)* %arrayidx6.i11.i9, align 1
+ %29 = insertelement <8 x i8> %27, i8 %28, i32 2
+ %arrayidx10.i13.i10 = getelementptr inbounds i8, i8 addrspace(1)* %in1, i64 7
+ %30 = load i8, i8 addrspace(1)* %arrayidx10.i13.i10, align 1
+ %31 = insertelement <8 x i8> %29, i8 %30, i32 3
+ %vecinit5.i11 = shufflevector <8 x i8> %23, <8 x i8> %31, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+ %cmp.i = icmp slt <8 x i8> %vecinit5.i, %vecinit5.i11
+ %cond.i = select <8 x i1> %cmp.i, <8 x i8> %vecinit5.i, <8 x i8> %vecinit5.i11
+ %32 = extractelement <8 x i8> %cond.i, i32 0
+ store i8 %32, i8 addrspace(1)* %out, align 1
+ %33 = extractelement <8 x i8> %cond.i, i32 1
+ %arrayidx2.i.i.i = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1
+ store i8 %33, i8 addrspace(1)* %arrayidx2.i.i.i, align 1
+ %34 = extractelement <8 x i8> %cond.i, i32 2
+ %arrayidx.i.i.i = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 2
+ store i8 %34, i8 addrspace(1)* %arrayidx.i.i.i, align 1
+ %35 = extractelement <8 x i8> %cond.i, i32 3
+ %arrayidx2.i6.i.i = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 3
+ store i8 %35, i8 addrspace(1)* %arrayidx2.i6.i.i, align 1
+ %arrayidx.i.i3 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 4
+ %36 = extractelement <8 x i8> %cond.i, i32 4
+ store i8 %36, i8 addrspace(1)* %arrayidx.i.i3, align 1
+ %37 = extractelement <8 x i8> %cond.i, i32 5
+ %arrayidx2.i.i6.i = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 5
+ store i8 %37, i8 addrspace(1)* %arrayidx2.i.i6.i, align 1
+ %38 = extractelement <8 x i8> %cond.i, i32 6
+ %arrayidx.i.i7.i = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 6
+ store i8 %38, i8 addrspace(1)* %arrayidx.i.i7.i, align 1
+ %39 = extractelement <8 x i8> %cond.i, i32 7
+ %arrayidx2.i6.i8.i = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 7
+ store i8 %39, i8 addrspace(1)* %arrayidx2.i6.i8.i, align 1
+ ret void
+}
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!opencl.kernels = !{!0, !1, !2, !3, !4, !5, !6, !7, !8}
+
+!0 = !{null}
+!1 = !{null}
+!2 = !{null}
+!3 = !{void (i8 addrspace(1)*, i8 addrspace(1)*, i8 addrspace(1)*)* @test_8_min_char}
+!4 = !{null}
+!5 = !{null}
+!6 = !{null}
+!7 = !{null}
+!8 = !{null}
diff --git a/test/CodeGen/AMDGPU/sign_extend.ll b/test/CodeGen/AMDGPU/sign_extend.ll
new file mode 100644
index 000000000000..06bee114c23a
--- /dev/null
+++ b/test/CodeGen/AMDGPU/sign_extend.ll
@@ -0,0 +1,63 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+; SI-LABEL: {{^}}s_sext_i1_to_i32:
+; SI: v_cndmask_b32_e64
+; SI: s_endpgm
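+; A sext from i1 yields 0 or -1; since the i1 comes from a compare, it is
+; presumably selected with a single conditional move, as checked above.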
+define void @s_sext_i1_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+ %cmp = icmp eq i32 %a, %b
+ %sext = sext i1 %cmp to i32
+ store i32 %sext, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}test_s_sext_i32_to_i64:
+; SI: s_ashr_i32
+; SI: s_endpgm
+define void @test_s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) nounwind {
+entry:
+ %mul = mul i32 %a, %b
+ %add = add i32 %mul, %c
+ %sext = sext i32 %add to i64
+ store i64 %sext, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; SI-LABEL: {{^}}s_sext_i1_to_i64:
+; SI: v_cndmask_b32_e64 v[[LOREG:[0-9]+]], 0, -1, vcc
+; SI: v_mov_b32_e32 v[[HIREG:[0-9]+]], v[[LOREG]]
+; SI: buffer_store_dwordx2 v{{\[}}[[LOREG]]:[[HIREG]]{{\]}}
+; SI: s_endpgm
+define void @s_sext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+ %cmp = icmp eq i32 %a, %b
+ %sext = sext i1 %cmp to i64
+ store i64 %sext, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; SI-LABEL: {{^}}s_sext_i32_to_i64:
+; SI: s_ashr_i32
+; SI: s_endpgm
+define void @s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a) nounwind {
+ %sext = sext i32 %a to i64
+ store i64 %sext, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; SI-LABEL: {{^}}v_sext_i32_to_i64:
+; SI: v_ashr
+; SI: s_endpgm
+define void @v_sext_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %val = load i32, i32 addrspace(1)* %in, align 4
+ %sext = sext i32 %val to i64
+ store i64 %sext, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; SI-LABEL: {{^}}s_sext_i16_to_i64:
+; SI: s_endpgm
+define void @s_sext_i16_to_i64(i64 addrspace(1)* %out, i16 %a) nounwind {
+ %sext = sext i16 %a to i64
+ store i64 %sext, i64 addrspace(1)* %out, align 8
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/simplify-demanded-bits-build-pair.ll b/test/CodeGen/AMDGPU/simplify-demanded-bits-build-pair.ll
new file mode 100644
index 000000000000..dffee70b6b02
--- /dev/null
+++ b/test/CodeGen/AMDGPU/simplify-demanded-bits-build-pair.ll
@@ -0,0 +1,39 @@
+; XFAIL: *
+; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI -mattr=-promote-alloca < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga -mattr=-promote-alloca < %s | FileCheck -check-prefix=SI %s
+
+; 64-bit select was originally lowered with a build_pair, and this
+; could be simplified to 1 cndmask instead of 2, but that broke when
+; it started being implemented with a v2i32 build_vector and
+; bitcasting.
+define void @trunc_select_i64(i32 addrspace(1)* %out, i64 %a, i64 %b, i32 %c) {
+ %cmp = icmp eq i32 %c, 0
+ %select = select i1 %cmp, i64 %a, i64 %b
+ %trunc = trunc i64 %select to i32
+ store i32 %trunc, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FIXME: Fix truncating store for local memory
+; SI-LABEL: {{^}}trunc_load_alloca_i64:
+; SI: v_movrels_b32
+; SI-NOT: v_movrels_b32
+; SI: s_endpgm
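+; 4294967296 is 1 << 32, so only bit 32 of the loaded value is demanded;
+; ideally only the high half of each i64 element needs the indexed
+; (v_movrels) read, hence the single v_movrels_b32 expected above.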
+define void @trunc_load_alloca_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) {
+ %idx = add i32 %a, %b
+ %alloca = alloca i64, i32 4
+ %gep0 = getelementptr i64, i64* %alloca, i64 0
+ %gep1 = getelementptr i64, i64* %alloca, i64 1
+ %gep2 = getelementptr i64, i64* %alloca, i64 2
+ %gep3 = getelementptr i64, i64* %alloca, i64 3
+ store i64 24, i64* %gep0, align 8
+ store i64 9334, i64* %gep1, align 8
+ store i64 3935, i64* %gep2, align 8
+ store i64 9342, i64* %gep3, align 8
+ %gep = getelementptr i64, i64* %alloca, i32 %idx
+ %load = load i64, i64* %gep, align 8
+ %mask = and i64 %load, 4294967296
+ %add = add i64 %mask, -1
+ store i64 %add, i64 addrspace(1)* %out, align 4
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/sint_to_fp.f64.ll b/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
new file mode 100644
index 000000000000..da4e91db3a38
--- /dev/null
+++ b/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
@@ -0,0 +1,61 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+
+; SI-LABEL: {{^}}sint_to_fp_i32_to_f64
+; SI: v_cvt_f64_i32_e32
+define void @sint_to_fp_i32_to_f64(double addrspace(1)* %out, i32 %in) {
+ %result = sitofp i32 %in to double
+ store double %result, double addrspace(1)* %out
+ ret void
+}
+
+; FIXME: select on 0, 0
+; SI-LABEL: {{^}}sint_to_fp_i1_f64:
+; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]\]]],
+; We can't fold the SGPRs into v_cndmask_b32_e64, because it already
+; uses an SGPR for [[CMP]]
+; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, v{{[0-9]+}}, [[CMP]]
+; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 0, [[CMP]]
+; SI: buffer_store_dwordx2
+; SI: s_endpgm
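+; Both 0.0 and -1.0 have a zero low dword, so the second select above is
+; presumably between 0 and 0 (the FIXME), while the high dword selects the
+; sign/exponent bits of -1.0.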
+define void @sint_to_fp_i1_f64(double addrspace(1)* %out, i32 %in) {
+ %cmp = icmp eq i32 %in, 0
+ %fp = sitofp i1 %cmp to double
+ store double %fp, double addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}sint_to_fp_i1_f64_load:
+; SI: v_cndmask_b32_e64 [[IRESULT:v[0-9]]], 0, -1
+; SI-NEXT: v_cvt_f64_i32_e32 [[RESULT:v\[[0-9]+:[0-9]\]]], [[IRESULT]]
+; SI: buffer_store_dwordx2 [[RESULT]]
+; SI: s_endpgm
+define void @sint_to_fp_i1_f64_load(double addrspace(1)* %out, i1 %in) {
+ %fp = sitofp i1 %in to double
+ store double %fp, double addrspace(1)* %out, align 8
+ ret void
+}
+
+; SI-LABEL: @s_sint_to_fp_i64_to_f64
+define void @s_sint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 %in) {
+ %result = sitofp i64 %in to double
+ store double %result, double addrspace(1)* %out
+ ret void
+}
+
+; SI-LABEL: @v_sint_to_fp_i64_to_f64
+; SI: buffer_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
+; SI: v_cvt_f64_i32_e32 [[HI_CONV:v\[[0-9]+:[0-9]+\]]], v[[HI]]
+; SI: v_ldexp_f64 [[LDEXP:v\[[0-9]+:[0-9]+\]]], [[HI_CONV]], 32
+; SI: v_cvt_f64_u32_e32 [[LO_CONV:v\[[0-9]+:[0-9]+\]]], v[[LO]]
+; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[LDEXP]], [[LO_CONV]]
+; SI: buffer_store_dwordx2 [[RESULT]]
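+; The i64 is presumably converted piecewise: the high word as a signed
+; conversion scaled by 2^32 via ldexp, the low word as an unsigned
+; conversion, with the two halves then added together.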
+define void @v_sint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 addrspace(1)* %in) {
+ %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
+ %val = load i64, i64 addrspace(1)* %gep, align 8
+ %result = sitofp i64 %val to double
+ store double %result, double addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/sint_to_fp.ll b/test/CodeGen/AMDGPU/sint_to_fp.ll
new file mode 100644
index 000000000000..8506441d1361
--- /dev/null
+++ b/test/CodeGen/AMDGPU/sint_to_fp.ll
@@ -0,0 +1,64 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
+
+
+; FUNC-LABEL: {{^}}s_sint_to_fp_i32_to_f32:
+; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].Z
+; SI: v_cvt_f32_i32_e32 {{v[0-9]+}}, {{s[0-9]+$}}
+define void @s_sint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 %in) {
+ %result = sitofp i32 %in to float
+ store float %result, float addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sint_to_fp_v2i32:
+; R600-DAG: INT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].W
+; R600-DAG: INT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[3].X
+
+; SI: v_cvt_f32_i32_e32
+; SI: v_cvt_f32_i32_e32
+define void @sint_to_fp_v2i32(<2 x float> addrspace(1)* %out, <2 x i32> %in) {
+ %result = sitofp <2 x i32> %in to <2 x float>
+ store <2 x float> %result, <2 x float> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sint_to_fp_v4i32:
+; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+; SI: v_cvt_f32_i32_e32
+; SI: v_cvt_f32_i32_e32
+; SI: v_cvt_f32_i32_e32
+; SI: v_cvt_f32_i32_e32
+define void @sint_to_fp_v4i32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+ %value = load <4 x i32>, <4 x i32> addrspace(1) * %in
+ %result = sitofp <4 x i32> %value to <4 x float>
+ store <4 x float> %result, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sint_to_fp_i1_f32:
+; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]\]]],
+; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1.0, [[CMP]]
+; SI: buffer_store_dword [[RESULT]],
+; SI: s_endpgm
+define void @sint_to_fp_i1_f32(float addrspace(1)* %out, i32 %in) {
+ %cmp = icmp eq i32 %in, 0
+ %fp = uitofp i1 %cmp to float
+ store float %fp, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sint_to_fp_i1_f32_load:
+; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1.0
+; SI: buffer_store_dword [[RESULT]],
+; SI: s_endpgm
+define void @sint_to_fp_i1_f32_load(float addrspace(1)* %out, i1 %in) {
+ %fp = sitofp i1 %in to float
+ store float %fp, float addrspace(1)* %out, align 4
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/smrd.ll b/test/CodeGen/AMDGPU/smrd.ll
new file mode 100644
index 000000000000..b0c18ca5959c
--- /dev/null
+++ b/test/CodeGen/AMDGPU/smrd.ll
@@ -0,0 +1,111 @@
+; RUN: llc < %s -march=amdgcn -mcpu=SI -show-mc-encoding -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=GCN %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -show-mc-encoding -verify-machineinstrs | FileCheck --check-prefix=VI --check-prefix=GCN %s
+
+; SMRD load with an immediate offset.
+; GCN-LABEL: {{^}}smrd0:
+; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x1 ; encoding: [0x01
+; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4
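+; The different immediates presumably reflect that SI encodes SMRD offsets
+; in dwords (0x1) while VI encodes them in bytes (0x4).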
+define void @smrd0(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
+entry:
+ %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 1
+ %1 = load i32, i32 addrspace(2)* %0
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; SMRD load with the largest possible immediate offset.
+; GCN-LABEL: {{^}}smrd1:
+; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff
+; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc
+define void @smrd1(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
+entry:
+ %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 255
+ %1 = load i32, i32 addrspace(2)* %0
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; SMRD load with an offset greater than the largest possible immediate.
+; GCN-LABEL: {{^}}smrd2:
+; SI: s_movk_i32 s[[OFFSET:[0-9]]], 0x400
+; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]]
+; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400
+; GCN: s_endpgm
+define void @smrd2(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
+entry:
+ %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 256
+ %1 = load i32, i32 addrspace(2)* %0
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; SMRD load with a 64-bit offset
+; GCN-LABEL: {{^}}smrd3:
+; FIXME: There are too many copies here because we don't fold immediates
+; through REG_SEQUENCE
+; SI: s_mov_b32 s[[SLO:[0-9]+]], 0 ;
+; SI: s_mov_b32 s[[SHI:[0-9]+]], 4
+; SI: s_mov_b32 s[[SSLO:[0-9]+]], s[[SLO]]
+; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SSLO]]
+; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
+; FIXME: We should be able to use s_load_dword here
+; SI: buffer_load_dword v{{[0-9]+}}, v{{\[}}[[VLO]]:[[VHI]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64
+; TODO: Add VI checks
+; GCN: s_endpgm
+define void @smrd3(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
+entry:
+ %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 4294967296 ; 2 ^ 32
+ %1 = load i32, i32 addrspace(2)* %0
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; SMRD load using the load.const intrinsic with an immediate offset
+; GCN-LABEL: {{^}}smrd_load_const0:
+; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4 ; encoding: [0x04
+; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x10
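+; The intrinsic takes a byte offset (16), which presumably shows up as the
+; dword offset 0x4 on SI and the byte offset 0x10 on VI.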
+define void @smrd_load_const0(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
+main_body:
+ %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0
+ %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20
+ %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 16)
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %22)
+ ret void
+}
+
+; SMRD load using the load.const intrinsic with the largest possible immediate
+; offset.
+; GCN-LABEL: {{^}}smrd_load_const1:
+; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff
+; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc
+define void @smrd_load_const1(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
+main_body:
+ %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0
+ %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20
+ %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 1020)
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %22)
+ ret void
+}
+; SMRD load using the load.const intrinsic with an offset greater than the
+; largest possible immediate offset.
+; GCN-LABEL: {{^}}smrd_load_const2:
+; SI: s_movk_i32 s[[OFFSET:[0-9]]], 0x400
+; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]]
+; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400
+define void @smrd_load_const2(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
+main_body:
+ %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0
+ %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20
+ %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 1024)
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %22)
+ ret void
+}
+
+; Function Attrs: nounwind readnone
+declare float @llvm.SI.load.const(<16 x i8>, i32) #1
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+
+attributes #0 = { "ShaderType"="0" }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/split-scalar-i64-add.ll b/test/CodeGen/AMDGPU/split-scalar-i64-add.ll
new file mode 100644
index 000000000000..46409cdfae1c
--- /dev/null
+++ b/test/CodeGen/AMDGPU/split-scalar-i64-add.ll
@@ -0,0 +1,48 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+declare i32 @llvm.r600.read.tidig.x() readnone
+
+; This is broken because the low half of the 64-bit add remains on the
+; SALU, but the upper half does not. The addc expects the carry bit
+; set in vcc, which is undefined since the low scalar half add sets
+; scc instead.
+
+; FUNC-LABEL: {{^}}imp_def_vcc_split_i64_add_0:
+; SI: v_add_i32
+; SI: v_addc_u32
+define void @imp_def_vcc_split_i64_add_0(i64 addrspace(1)* %out, i32 %val) {
+ %vec.0 = insertelement <2 x i32> undef, i32 %val, i32 0
+ %vec.1 = insertelement <2 x i32> %vec.0, i32 999999, i32 1
+ %bc = bitcast <2 x i32> %vec.1 to i64
+ %add = add i64 %bc, 399
+ store i64 %add, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}imp_def_vcc_split_i64_add_1:
+; SI: v_add_i32
+; SI: v_addc_u32
+define void @imp_def_vcc_split_i64_add_1(i64 addrspace(1)* %out, i32 %val0, i64 %val1) {
+ %vec.0 = insertelement <2 x i32> undef, i32 %val0, i32 0
+ %vec.1 = insertelement <2 x i32> %vec.0, i32 99999, i32 1
+ %bc = bitcast <2 x i32> %vec.1 to i64
+ %add = add i64 %bc, %val1
+ store i64 %add, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; Doesn't use constants
+; FUNC-LABEL: {{^}}imp_def_vcc_split_i64_add_2:
+; SI: v_add_i32
+; SI: v_addc_u32
+define void @imp_def_vcc_split_i64_add_2(i64 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %val0, i64 %val1) {
+ %tid = call i32 @llvm.r600.read.tidig.x() readnone
+ %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+ %load = load i32, i32 addrspace(1)* %gep
+ %vec.0 = insertelement <2 x i32> undef, i32 %val0, i32 0
+ %vec.1 = insertelement <2 x i32> %vec.0, i32 %load, i32 1
+ %bc = bitcast <2 x i32> %vec.1 to i64
+ %add = add i64 %bc, %val1
+ store i64 %add, i64 addrspace(1)* %out, align 8
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/sra.ll b/test/CodeGen/AMDGPU/sra.ll
new file mode 100644
index 000000000000..bcbc32f4c053
--- /dev/null
+++ b/test/CodeGen/AMDGPU/sra.ll
@@ -0,0 +1,213 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=VI %s
+
+;EG-LABEL: {{^}}ashr_v2i32:
+;EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+;SI-LABEL: {{^}}ashr_v2i32:
+;SI: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+
+;VI-LABEL: {{^}}ashr_v2i32:
+;VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+
+define void @ashr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+ %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
+ %a = load <2 x i32>, <2 x i32> addrspace(1) * %in
+ %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr
+ %result = ashr <2 x i32> %a, %b
+ store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+;EG-LABEL: {{^}}ashr_v4i32:
+;EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+;SI-LABEL: {{^}}ashr_v4i32:
+;SI: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+
+;VI-LABEL: {{^}}ashr_v4i32:
+;VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+
+define void @ashr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+ %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
+ %a = load <4 x i32>, <4 x i32> addrspace(1) * %in
+ %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr
+ %result = ashr <4 x i32> %a, %b
+ store <4 x i32> %result, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+;EG-LABEL: {{^}}ashr_i64:
+;EG: ASHR
+
+;SI-LABEL: {{^}}ashr_i64:
+;SI: s_ashr_i64 s[{{[0-9]}}:{{[0-9]}}], s[{{[0-9]}}:{{[0-9]}}], 8
+
+;VI-LABEL: {{^}}ashr_i64:
+;VI: s_ashr_i64 s[{{[0-9]}}:{{[0-9]}}], s[{{[0-9]}}:{{[0-9]}}], 8
+
+define void @ashr_i64(i64 addrspace(1)* %out, i32 %in) {
+entry:
+ %0 = sext i32 %in to i64
+ %1 = ashr i64 %0, 8
+ store i64 %1, i64 addrspace(1)* %out
+ ret void
+}
+
+;EG-LABEL: {{^}}ashr_i64_2:
+;EG: SUB_INT {{\*? *}}[[COMPSH:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHIFT:T[0-9]+\.[XYZW]]]
+;EG: LSHL {{\* *}}[[TEMP:T[0-9]+\.[XYZW]]], [[OPHI:T[0-9]+\.[XYZW]]], {{[[COMPSH]]|PV.[XYZW]}}
+;EG: LSHL {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1
+;EG-DAG: ADD_INT {{\*? *}}[[BIGSH:T[0-9]+\.[XYZW]]], [[SHIFT]], literal
+;EG-DAG: LSHR {{\*? *}}[[LOSMTMP:T[0-9]+\.[XYZW]]], [[OPLO:T[0-9]+\.[XYZW]]], [[SHIFT]]
+;EG-DAG: OR_INT {{\*? *}}[[LOSM:T[0-9]+\.[XYZW]]], {{[[LOSMTMP]]|PV.[XYZW]}}, {{[[OVERF]]|PV.[XYZW]}}
+;EG-DAG: ASHR {{\*? *}}[[HISM:T[0-9]+\.[XYZW]]], [[OPHI]], {{PS|[[SHIFT]]}}
+;EG-DAG: ASHR {{\*? *}}[[LOBIG:T[0-9]+\.[XYZW]]], [[OPHI]], literal
+;EG-DAG: ASHR {{\*? *}}[[HIBIG:T[0-9]+\.[XYZW]]], [[OPHI]], literal
+;EG-DAG: SETGT_UINT {{\*? *}}[[RESC:T[0-9]+\.[XYZW]]], [[SHIFT]], literal
+;EG-DAG: CNDE_INT {{\*? *}}[[RESLO:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}}
+;EG-DAG: CNDE_INT {{\*? *}}[[RESHI:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}}
+
+;SI-LABEL: {{^}}ashr_i64_2:
+;SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+
+;VI-LABEL: {{^}}ashr_i64_2:
+;VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
+
+define void @ashr_i64_2(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+entry:
+ %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1
+ %a = load i64, i64 addrspace(1) * %in
+ %b = load i64, i64 addrspace(1) * %b_ptr
+ %result = ashr i64 %a, %b
+ store i64 %result, i64 addrspace(1)* %out
+ ret void
+}
+
+;EG-LABEL: {{^}}ashr_v2i64:
+;EG-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]]
+;EG-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]]
+;EG-DAG: LSHL {{\*? *}}[[COMPSHA]]
+;EG-DAG: LSHL {{\*? *}}[[COMPSHB]]
+;EG-DAG: LSHL {{.*}}, 1
+;EG-DAG: LSHL {{.*}}, 1
+;EG-DAG: ASHR {{.*}}, [[SHA]]
+;EG-DAG: ASHR {{.*}}, [[SHB]]
+;EG-DAG: LSHR {{.*}}, [[SHA]]
+;EG-DAG: LSHR {{.*}}, [[SHB]]
+;EG-DAG: OR_INT
+;EG-DAG: OR_INT
+;EG-DAG: ADD_INT {{\*? *}}[[BIGSHA:T[0-9]+\.[XYZW]]]{{.*}}, literal
+;EG-DAG: ADD_INT {{\*? *}}[[BIGSHB:T[0-9]+\.[XYZW]]]{{.*}}, literal
+;EG-DAG: ASHR
+;EG-DAG: ASHR
+;EG-DAG: ASHR {{.*}}, literal
+;EG-DAG: ASHR {{.*}}, literal
+;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHA]], literal
+;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHB]], literal
+;EG-DAG: CNDE_INT
+;EG-DAG: CNDE_INT
+;EG-DAG: CNDE_INT
+;EG-DAG: CNDE_INT
+
+;SI-LABEL: {{^}}ashr_v2i64:
+;SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+;SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+
+;VI-LABEL: {{^}}ashr_v2i64:
+;VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
+;VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
+
+define void @ashr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
+ %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1
+ %a = load <2 x i64>, <2 x i64> addrspace(1) * %in
+ %b = load <2 x i64>, <2 x i64> addrspace(1) * %b_ptr
+ %result = ashr <2 x i64> %a, %b
+ store <2 x i64> %result, <2 x i64> addrspace(1)* %out
+ ret void
+}
+
+;EG-LABEL: {{^}}ashr_v4i64:
+;EG-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]]
+;EG-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]]
+;EG-DAG: SUB_INT {{\*? *}}[[COMPSHC:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHC:T[0-9]+\.[XYZW]]]
+;EG-DAG: SUB_INT {{\*? *}}[[COMPSHD:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHD:T[0-9]+\.[XYZW]]]
+;EG-DAG: LSHL {{\*? *}}[[COMPSHA]]
+;EG-DAG: LSHL {{\*? *}}[[COMPSHB]]
+;EG-DAG: LSHL {{\*? *}}[[COMPSHC]]
+;EG-DAG: LSHL {{\*? *}}[[COMPSHD]]
+;EG-DAG: LSHL {{.*}}, 1
+;EG-DAG: LSHL {{.*}}, 1
+;EG-DAG: LSHL {{.*}}, 1
+;EG-DAG: LSHL {{.*}}, 1
+;EG-DAG: ASHR {{.*}}, [[SHA]]
+;EG-DAG: ASHR {{.*}}, [[SHB]]
+;EG-DAG: ASHR {{.*}}, [[SHC]]
+;EG-DAG: ASHR {{.*}}, [[SHD]]
+;EG-DAG: LSHR {{.*}}, [[SHA]]
+;EG-DAG: LSHR {{.*}}, [[SHB]]
+;EG-DAG: LSHR {{.*}}, [[SHA]]
+;EG-DAG: LSHR {{.*}}, [[SHB]]
+;EG-DAG: OR_INT
+;EG-DAG: OR_INT
+;EG-DAG: OR_INT
+;EG-DAG: OR_INT
+;EG-DAG: ADD_INT {{\*? *}}[[BIGSHA:T[0-9]+\.[XYZW]]]{{.*}}, literal
+;EG-DAG: ADD_INT {{\*? *}}[[BIGSHB:T[0-9]+\.[XYZW]]]{{.*}}, literal
+;EG-DAG: ADD_INT {{\*? *}}[[BIGSHC:T[0-9]+\.[XYZW]]]{{.*}}, literal
+;EG-DAG: ADD_INT {{\*? *}}[[BIGSHD:T[0-9]+\.[XYZW]]]{{.*}}, literal
+;EG-DAG: ASHR
+;EG-DAG: ASHR
+;EG-DAG: ASHR
+;EG-DAG: ASHR
+;EG-DAG: ASHR {{.*}}, literal
+;EG-DAG: ASHR {{.*}}, literal
+;EG-DAG: ASHR {{.*}}, literal
+;EG-DAG: ASHR {{.*}}, literal
+;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHA]], literal
+;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHB]], literal
+;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHC]], literal
+;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHD]], literal
+;EG-DAG: CNDE_INT
+;EG-DAG: CNDE_INT
+;EG-DAG: CNDE_INT
+;EG-DAG: CNDE_INT
+;EG-DAG: CNDE_INT
+;EG-DAG: CNDE_INT
+;EG-DAG: CNDE_INT
+;EG-DAG: CNDE_INT
+
+;SI-LABEL: {{^}}ashr_v4i64:
+;SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+;SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+;SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+;SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+
+;VI-LABEL: {{^}}ashr_v4i64:
+;VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
+;VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
+;VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
+;VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
+
+define void @ashr_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
+ %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1
+ %a = load <4 x i64>, <4 x i64> addrspace(1) * %in
+ %b = load <4 x i64>, <4 x i64> addrspace(1) * %b_ptr
+ %result = ashr <4 x i64> %a, %b
+ store <4 x i64> %result, <4 x i64> addrspace(1)* %out
+ ret void
+}
+
diff --git a/test/CodeGen/AMDGPU/srem.ll b/test/CodeGen/AMDGPU/srem.ll
new file mode 100644
index 000000000000..c78fd549b316
--- /dev/null
+++ b/test/CodeGen/AMDGPU/srem.ll
@@ -0,0 +1,112 @@
+; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s
+
+define void @srem_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+ %num = load i32, i32 addrspace(1) * %in
+ %den = load i32, i32 addrspace(1) * %den_ptr
+ %result = srem i32 %num, %den
+ store i32 %result, i32 addrspace(1)* %out
+ ret void
+}
+
+define void @srem_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %num = load i32, i32 addrspace(1) * %in
+ %result = srem i32 %num, 4
+ store i32 %result, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}srem_i32_7:
+; SI: v_mov_b32_e32 [[MAGIC:v[0-9]+]], 0x92492493
+; SI: v_mul_hi_i32 {{v[0-9]+}}, [[MAGIC]],
+; SI: v_mul_lo_i32
+; SI: v_sub_i32
+; SI: s_endpgm
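+; 0x92492493 is presumably the magic multiplier for signed division by 7,
+; i.e. the srem is expanded to a multiply-high based division followed by a
+; multiply and subtract.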
+define void @srem_i32_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %num = load i32, i32 addrspace(1) * %in
+ %result = srem i32 %num, 7
+ store i32 %result, i32 addrspace(1)* %out
+ ret void
+}
+
+define void @srem_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+ %den_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
+ %num = load <2 x i32>, <2 x i32> addrspace(1) * %in
+ %den = load <2 x i32>, <2 x i32> addrspace(1) * %den_ptr
+ %result = srem <2 x i32> %num, %den
+ store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+define void @srem_v2i32_4(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+ %num = load <2 x i32>, <2 x i32> addrspace(1) * %in
+ %result = srem <2 x i32> %num, <i32 4, i32 4>
+ store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+define void @srem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+ %den_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
+ %num = load <4 x i32>, <4 x i32> addrspace(1) * %in
+ %den = load <4 x i32>, <4 x i32> addrspace(1) * %den_ptr
+ %result = srem <4 x i32> %num, %den
+ store <4 x i32> %result, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+define void @srem_v4i32_4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+ %num = load <4 x i32>, <4 x i32> addrspace(1) * %in
+ %result = srem <4 x i32> %num, <i32 4, i32 4, i32 4, i32 4>
+ store <4 x i32> %result, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+define void @srem_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+ %den_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1
+ %num = load i64, i64 addrspace(1) * %in
+ %den = load i64, i64 addrspace(1) * %den_ptr
+ %result = srem i64 %num, %den
+ store i64 %result, i64 addrspace(1)* %out
+ ret void
+}
+
+define void @srem_i64_4(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+ %num = load i64, i64 addrspace(1) * %in
+ %result = srem i64 %num, 4
+ store i64 %result, i64 addrspace(1)* %out
+ ret void
+}
+
+define void @srem_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
+ %den_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1
+ %num = load <2 x i64>, <2 x i64> addrspace(1) * %in
+ %den = load <2 x i64>, <2 x i64> addrspace(1) * %den_ptr
+ %result = srem <2 x i64> %num, %den
+ store <2 x i64> %result, <2 x i64> addrspace(1)* %out
+ ret void
+}
+
+define void @srem_v2i64_4(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
+ %num = load <2 x i64>, <2 x i64> addrspace(1) * %in
+ %result = srem <2 x i64> %num, <i64 4, i64 4>
+ store <2 x i64> %result, <2 x i64> addrspace(1)* %out
+ ret void
+}
+
+define void @srem_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
+ %den_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1
+ %num = load <4 x i64>, <4 x i64> addrspace(1) * %in
+ %den = load <4 x i64>, <4 x i64> addrspace(1) * %den_ptr
+ %result = srem <4 x i64> %num, %den
+ store <4 x i64> %result, <4 x i64> addrspace(1)* %out
+ ret void
+}
+
+define void @srem_v4i64_4(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
+ %num = load <4 x i64>, <4 x i64> addrspace(1) * %in
+ %result = srem <4 x i64> %num, <i64 4, i64 4, i64 4, i64 4>
+ store <4 x i64> %result, <4 x i64> addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/srl.ll b/test/CodeGen/AMDGPU/srl.ll
new file mode 100644
index 000000000000..4904d7fa1bd0
--- /dev/null
+++ b/test/CodeGen/AMDGPU/srl.ll
@@ -0,0 +1,186 @@
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}lshr_i32:
+; SI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; VI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+define void @lshr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+ %a = load i32, i32 addrspace(1)* %in
+ %b = load i32, i32 addrspace(1)* %b_ptr
+ %result = lshr i32 %a, %b
+ store i32 %result, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lshr_v2i32:
+; SI: v_lshr_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_lshr_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+
+; VI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; VI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+
+; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+define void @lshr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+ %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
+ %a = load <2 x i32>, <2 x i32> addrspace(1)* %in
+ %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr
+ %result = lshr <2 x i32> %a, %b
+ store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lshr_v4i32:
+; SI: v_lshr_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_lshr_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_lshr_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_lshr_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+
+; VI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; VI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; VI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; VI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+
+; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+define void @lshr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+ %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
+ %a = load <4 x i32>, <4 x i32> addrspace(1)* %in
+ %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr
+ %result = lshr <4 x i32> %a, %b
+ store <4 x i32> %result, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lshr_i64:
+; SI: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+; VI: v_lshrrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
+
+; EG: SUB_INT {{\*? *}}[[COMPSH:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHIFT:T[0-9]+\.[XYZW]]]
+; EG: LSHL {{\* *}}[[TEMP:T[0-9]+\.[XYZW]]], [[OPHI:T[0-9]+\.[XYZW]]], {{[[COMPSH]]|PV.[XYZW]}}
+; EG: LSHL {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1
+; EG-DAG: ADD_INT {{\*? *}}[[BIGSH:T[0-9]+\.[XYZW]]], [[SHIFT]], literal
+; EG-DAG: LSHR {{\*? *}}[[LOSMTMP:T[0-9]+\.[XYZW]]], [[OPLO:T[0-9]+\.[XYZW]]], [[SHIFT]]
+; EG-DAG: OR_INT {{\*? *}}[[LOSM:T[0-9]+\.[XYZW]]], {{[[LOSMTMP]]|PV.[XYZW]}}, {{[[OVERF]]|PV.[XYZW]}}
+; EG-DAG: LSHR {{\*? *}}[[HISM:T[0-9]+\.[XYZW]]], [[OPHI]], {{PS|[[SHIFT]]}}
+; EG-DAG: LSHR {{\*? *}}[[LOBIG:T[0-9]+\.[XYZW]]], [[OPHI]], {{PS|[[SHIFT]]}}
+; EG-DAG: SETGT_UINT {{\*? *}}[[RESC:T[0-9]+\.[XYZW]]], [[SHIFT]], literal
+; EG-DAG: CNDE_INT {{\*? *}}[[RESLO:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}}
+; EG-DAG: CNDE_INT {{\*? *}}[[RESHI:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW], .*}}, 0.0
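+; The EG checks above appear to describe the usual 64-bit shift expansion:
+; for shifts < 32 the low result combines lo >> n with overflow bits from
+; hi, for shifts >= 32 the low result is hi >> (n - 32) and the high result
+; is 0, with CNDE_INT selecting between the two cases.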
+define void @lshr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+ %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1
+ %a = load i64, i64 addrspace(1)* %in
+ %b = load i64, i64 addrspace(1)* %b_ptr
+ %result = lshr i64 %a, %b
+ store i64 %result, i64 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lshr_v2i64:
+; SI: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+; SI: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+
+; VI: v_lshrrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
+; VI: v_lshrrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
+
+; EG-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]]
+; EG-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]]
+; EG-DAG: LSHL {{\*? *}}[[COMPSHA]]
+; EG-DAG: LSHL {{\*? *}}[[COMPSHB]]
+; EG-DAG: LSHL {{.*}}, 1
+; EG-DAG: LSHL {{.*}}, 1
+; EG-DAG: LSHR {{.*}}, [[SHA]]
+; EG-DAG: LSHR {{.*}}, [[SHB]]
+; EG-DAG: LSHR {{.*}}, [[SHA]]
+; EG-DAG: LSHR {{.*}}, [[SHB]]
+; EG-DAG: OR_INT
+; EG-DAG: OR_INT
+; EG-DAG: ADD_INT {{\*? *}}[[BIGSHA:T[0-9]+\.[XYZW]]]{{.*}}, literal
+; EG-DAG: ADD_INT {{\*? *}}[[BIGSHB:T[0-9]+\.[XYZW]]]{{.*}}, literal
+; EG-DAG: LSHR
+; EG-DAG: LSHR
+; EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHA]], literal
+; EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHB]], literal
+; EG-DAG: CNDE_INT {{.*}}, 0.0
+; EG-DAG: CNDE_INT {{.*}}, 0.0
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+define void @lshr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
+ %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1
+ %a = load <2 x i64>, <2 x i64> addrspace(1)* %in
+ %b = load <2 x i64>, <2 x i64> addrspace(1)* %b_ptr
+ %result = lshr <2 x i64> %a, %b
+ store <2 x i64> %result, <2 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lshr_v4i64:
+; SI: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+; SI: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+; SI: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+; SI: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+
+; VI: v_lshrrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
+; VI: v_lshrrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
+; VI: v_lshrrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
+; VI: v_lshrrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
+
+; EG-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]]
+; EG-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]]
+; EG-DAG: SUB_INT {{\*? *}}[[COMPSHC:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHC:T[0-9]+\.[XYZW]]]
+; EG-DAG: SUB_INT {{\*? *}}[[COMPSHD:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHD:T[0-9]+\.[XYZW]]]
+; EG-DAG: LSHL {{\*? *}}[[COMPSHA]]
+; EG-DAG: LSHL {{\*? *}}[[COMPSHB]]
+; EG-DAG: LSHL {{\*? *}}[[COMPSHC]]
+; EG-DAG: LSHL {{\*? *}}[[COMPSHD]]
+; EG-DAG: LSHL {{.*}}, 1
+; EG-DAG: LSHL {{.*}}, 1
+; EG-DAG: LSHL {{.*}}, 1
+; EG-DAG: LSHL {{.*}}, 1
+; EG-DAG: LSHR {{.*}}, [[SHA]]
+; EG-DAG: LSHR {{.*}}, [[SHB]]
+; EG-DAG: LSHR {{.*}}, [[SHC]]
+; EG-DAG: LSHR {{.*}}, [[SHD]]
+; EG-DAG: LSHR {{.*}}, [[SHA]]
+; EG-DAG: LSHR {{.*}}, [[SHB]]
+; EG-DAG: LSHR {{.*}}, [[SHC]]
+; EG-DAG: LSHR {{.*}}, [[SHD]]
+; EG-DAG: OR_INT
+; EG-DAG: OR_INT
+; EG-DAG: OR_INT
+; EG-DAG: OR_INT
+; EG-DAG: ADD_INT {{\*? *}}[[BIGSHA:T[0-9]+\.[XYZW]]]{{.*}}, literal
+; EG-DAG: ADD_INT {{\*? *}}[[BIGSHB:T[0-9]+\.[XYZW]]]{{.*}}, literal
+; EG-DAG: ADD_INT {{\*? *}}[[BIGSHC:T[0-9]+\.[XYZW]]]{{.*}}, literal
+; EG-DAG: ADD_INT {{\*? *}}[[BIGSHD:T[0-9]+\.[XYZW]]]{{.*}}, literal
+; EG-DAG: LSHR
+; EG-DAG: LSHR
+; EG-DAG: LSHR
+; EG-DAG: LSHR
+; EG-DAG: LSHR
+; EG-DAG: LSHR
+; EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHA]], literal
+; EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHB]], literal
+; EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHC]], literal
+; EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHD]], literal
+; EG-DAG: CNDE_INT {{.*}}, 0.0
+; EG-DAG: CNDE_INT {{.*}}, 0.0
+; EG-DAG: CNDE_INT {{.*}}, 0.0
+; EG-DAG: CNDE_INT {{.*}}, 0.0
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+define void @lshr_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
+ %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1
+ %a = load <4 x i64>, <4 x i64> addrspace(1)* %in
+ %b = load <4 x i64>, <4 x i64> addrspace(1)* %b_ptr
+ %result = lshr <4 x i64> %a, %b
+ store <4 x i64> %result, <4 x i64> addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/ssubo.ll b/test/CodeGen/AMDGPU/ssubo.ll
new file mode 100644
index 000000000000..26884a1b7761
--- /dev/null
+++ b/test/CodeGen/AMDGPU/ssubo.ll
@@ -0,0 +1,65 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s
+
+declare { i32, i1 } @llvm.ssub.with.overflow.i32(i32, i32) nounwind readnone
+declare { i64, i1 } @llvm.ssub.with.overflow.i64(i64, i64) nounwind readnone
+
+; FUNC-LABEL: {{^}}ssubo_i64_zext:
+define void @ssubo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+ %ssub = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 %a, i64 %b) nounwind
+ %val = extractvalue { i64, i1 } %ssub, 0
+ %carry = extractvalue { i64, i1 } %ssub, 1
+ %ext = zext i1 %carry to i64
+ %add2 = add i64 %val, %ext
+ store i64 %add2, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_ssubo_i32:
+define void @s_ssubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind {
+ %ssub = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 %a, i32 %b) nounwind
+ %val = extractvalue { i32, i1 } %ssub, 0
+ %carry = extractvalue { i32, i1 } %ssub, 1
+ store i32 %val, i32 addrspace(1)* %out, align 4
+ store i1 %carry, i1 addrspace(1)* %carryout
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_ssubo_i32:
+define void @v_ssubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
+ %a = load i32, i32 addrspace(1)* %aptr, align 4
+ %b = load i32, i32 addrspace(1)* %bptr, align 4
+ %ssub = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 %a, i32 %b) nounwind
+ %val = extractvalue { i32, i1 } %ssub, 0
+ %carry = extractvalue { i32, i1 } %ssub, 1
+ store i32 %val, i32 addrspace(1)* %out, align 4
+ store i1 %carry, i1 addrspace(1)* %carryout
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_ssubo_i64:
+; SI: s_sub_u32
+; SI: s_subb_u32
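+; A 64-bit subtract is presumably split into a low subtract and a high
+; subtract-with-borrow, matching the two instructions checked above.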
+define void @s_ssubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind {
+ %ssub = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 %a, i64 %b) nounwind
+ %val = extractvalue { i64, i1 } %ssub, 0
+ %carry = extractvalue { i64, i1 } %ssub, 1
+ store i64 %val, i64 addrspace(1)* %out, align 8
+ store i1 %carry, i1 addrspace(1)* %carryout
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_ssubo_i64:
+; SI: v_sub_i32_e32
+; SI: v_subb_u32_e32
+define void @v_ssubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
+ %a = load i64, i64 addrspace(1)* %aptr, align 4
+ %b = load i64, i64 addrspace(1)* %bptr, align 4
+ %ssub = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 %a, i64 %b) nounwind
+ %val = extractvalue { i64, i1 } %ssub, 0
+ %carry = extractvalue { i64, i1 } %ssub, 1
+ store i64 %val, i64 addrspace(1)* %out, align 8
+ store i1 %carry, i1 addrspace(1)* %carryout
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/store-barrier.ll b/test/CodeGen/AMDGPU/store-barrier.ll
new file mode 100644
index 000000000000..4a72b4d090ad
--- /dev/null
+++ b/test/CodeGen/AMDGPU/store-barrier.ll
@@ -0,0 +1,42 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck --check-prefix=CHECK %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck --check-prefix=CHECK %s
+
+; This test is for a bug in the machine scheduler where stores without
+; an underlying object would be moved across the barrier. In this
+; test, the <2 x i8> store will be split into two i8 stores, so they
+; won't have an underlying object.
+
+; CHECK-LABEL: {{^}}test:
+; CHECK: ds_write_b8
+; CHECK: ds_write_b8
+; CHECK: s_barrier
+; CHECK: s_endpgm
+; Function Attrs: nounwind
+define void @test(<2 x i8> addrspace(3)* nocapture %arg, <2 x i8> addrspace(1)* nocapture readonly %arg1, i32 addrspace(1)* nocapture readonly %arg2, <2 x i8> addrspace(1)* nocapture %arg3, i32 %arg4, i64 %tmp9) {
+bb:
+ %tmp10 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp9
+ %tmp13 = load i32, i32 addrspace(1)* %tmp10, align 2
+ %tmp14 = getelementptr inbounds <2 x i8>, <2 x i8> addrspace(3)* %arg, i32 %tmp13
+ %tmp15 = load <2 x i8>, <2 x i8> addrspace(3)* %tmp14, align 2
+ %tmp16 = add i32 %tmp13, 1
+ %tmp17 = getelementptr inbounds <2 x i8>, <2 x i8> addrspace(3)* %arg, i32 %tmp16
+ store <2 x i8> %tmp15, <2 x i8> addrspace(3)* %tmp17, align 2
+ tail call void @llvm.AMDGPU.barrier.local() #2
+ %tmp25 = load i32, i32 addrspace(1)* %tmp10, align 4
+ %tmp26 = sext i32 %tmp25 to i64
+ %tmp27 = sext i32 %arg4 to i64
+ %tmp28 = getelementptr inbounds <2 x i8>, <2 x i8> addrspace(3)* %arg, i32 %tmp25, i32 %arg4
+ %tmp29 = load i8, i8 addrspace(3)* %tmp28, align 1
+ %tmp30 = getelementptr inbounds <2 x i8>, <2 x i8> addrspace(1)* %arg3, i64 %tmp26, i64 %tmp27
+ store i8 %tmp29, i8 addrspace(1)* %tmp30, align 1
+ %tmp32 = getelementptr inbounds <2 x i8>, <2 x i8> addrspace(3)* %arg, i32 %tmp25, i32 0
+ %tmp33 = load i8, i8 addrspace(3)* %tmp32, align 1
+ %tmp35 = getelementptr inbounds <2 x i8>, <2 x i8> addrspace(1)* %arg3, i64 %tmp26, i64 0
+ store i8 %tmp33, i8 addrspace(1)* %tmp35, align 1
+ ret void
+}
+
+; Function Attrs: noduplicate nounwind
+declare void @llvm.AMDGPU.barrier.local() #2
+
+attributes #2 = { noduplicate nounwind }
diff --git a/test/CodeGen/AMDGPU/store-v3i32.ll b/test/CodeGen/AMDGPU/store-v3i32.ll
new file mode 100644
index 000000000000..33617b55ed64
--- /dev/null
+++ b/test/CodeGen/AMDGPU/store-v3i32.ll
@@ -0,0 +1,13 @@
+; XFAIL: *
+; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI %s
+
+; 3-element vectors have the same size and alignment as 4-element vectors, so
+; this should be done in a single store.
+
+; SI-LABEL: {{^}}store_v3i32:
+; SI: buffer_store_dwordx4
+define void @store_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a) nounwind {
+ store <3 x i32> %a, <3 x i32> addrspace(1)* %out, align 16
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/store-v3i64.ll b/test/CodeGen/AMDGPU/store-v3i64.ll
new file mode 100644
index 000000000000..e0c554ad2c17
--- /dev/null
+++ b/test/CodeGen/AMDGPU/store-v3i64.ll
@@ -0,0 +1,29 @@
+; XFAIL: *
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+; SI-LABEL: {{^}}global_store_v3i64:
+; SI: buffer_store_dwordx4
+; SI: buffer_store_dwordx4
+define void @global_store_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> %x) {
+ store <3 x i64> %x, <3 x i64> addrspace(1)* %out, align 32
+ ret void
+}
+
+; SI-LABEL: {{^}}global_store_v3i64_unaligned:
+define void @global_store_v3i64_unaligned(<3 x i64> addrspace(1)* %out, <3 x i64> %x) {
+ store <3 x i64> %x, <3 x i64> addrspace(1)* %out, align 1
+ ret void
+}
+
+; SI-LABEL: {{^}}local_store_v3i64:
+define void @local_store_v3i64(<3 x i64> addrspace(3)* %out, <3 x i64> %x) {
+ store <3 x i64> %x, <3 x i64> addrspace(3)* %out, align 32
+ ret void
+}
+
+; SI-LABEL: {{^}}local_store_v3i64_unaligned:
+define void @local_store_v3i64_unaligned(<3 x i64> addrspace(3)* %out, <3 x i64> %x) {
+ store <3 x i64> %x, <3 x i64> addrspace(3)* %out, align 1
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/store-vector-ptrs.ll b/test/CodeGen/AMDGPU/store-vector-ptrs.ll
new file mode 100644
index 000000000000..d5af3b29118a
--- /dev/null
+++ b/test/CodeGen/AMDGPU/store-vector-ptrs.ll
@@ -0,0 +1,12 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s
+
+; This tests for a bug that caused a crash in
+; AMDGPUDAGToDAGISel::SelectMUBUFScratch() which is used for selecting
+; scratch loads and stores.
+; CHECK-LABEL: {{^}}store_vector_ptrs:
+define void @store_vector_ptrs(<4 x i32*>* %out, <4 x [1024 x i32]*> %array) nounwind {
+ %p = getelementptr [1024 x i32], <4 x [1024 x i32]*> %array, <4 x i16> zeroinitializer, <4 x i16> <i16 16, i16 16, i16 16, i16 16>
+ store <4 x i32*> %p, <4 x i32*>* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/store.ll b/test/CodeGen/AMDGPU/store.ll
new file mode 100644
index 000000000000..0f89405e073b
--- /dev/null
+++ b/test/CodeGen/AMDGPU/store.ll
@@ -0,0 +1,369 @@
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=CM -check-prefix=FUNC %s
+
+;===------------------------------------------------------------------------===;
+; Global Address Space
+;===------------------------------------------------------------------------===;
+; FUNC-LABEL: {{^}}store_i1:
+; EG: MEM_RAT MSKOR
+; SI: buffer_store_byte
+define void @store_i1(i1 addrspace(1)* %out) {
+entry:
+ store i1 true, i1 addrspace(1)* %out
+ ret void
+}
+
+; i8 store
+; FUNC-LABEL: {{^}}store_i8:
+; EG: MEM_RAT MSKOR T[[RW_GPR:[0-9]]].XW, T{{[0-9]}}.X
+
+; IG 0: Get the byte index and truncate the value
+; EG: AND_INT * T{{[0-9]}}.[[BI_CHAN:[XYZW]]], KC0[2].Y, literal.x
+; EG: LSHL T{{[0-9]}}.[[SHIFT_CHAN:[XYZW]]], PV.[[BI_CHAN]], literal.x
+; EG: AND_INT * T{{[0-9]}}.[[TRUNC_CHAN:[XYZW]]], KC0[2].Z, literal.y
+; EG-NEXT: 3(4.203895e-45), 255(3.573311e-43)
+
+
+; IG 1: Truncate the calculated shift amount for the mask
+
+; IG 2: Shift the value and the mask
+; EG: LSHL T[[RW_GPR]].X, PS, PV.[[SHIFT_CHAN]]
+; EG: LSHL * T[[RW_GPR]].W, literal.x, PV.[[SHIFT_CHAN]]
+; EG-NEXT: 255
+; IG 3: Initialize the Y and Z channels to zero
+; XXX: An optimal scheduler should merge this into one of the previous IGs.
+; EG: MOV T[[RW_GPR]].Y, 0.0
+; EG: MOV * T[[RW_GPR]].Z, 0.0
+
+; SI: buffer_store_byte
+
+define void @store_i8(i8 addrspace(1)* %out, i8 %in) {
+entry:
+ store i8 %in, i8 addrspace(1)* %out
+ ret void
+}
+
+; i16 store
+; FUNC-LABEL: {{^}}store_i16:
+; EG: MEM_RAT MSKOR T[[RW_GPR:[0-9]]].XW, T{{[0-9]}}.X
+
+; IG 0: Get the byte index and truncate the value
+
+
+; EG: AND_INT * T{{[0-9]}}.[[BI_CHAN:[XYZW]]], KC0[2].Y, literal.x
+; EG-NEXT: 3(4.203895e-45),
+
+; EG: LSHL T{{[0-9]}}.[[SHIFT_CHAN:[XYZW]]], PV.[[BI_CHAN]], literal.x
+; EG: AND_INT * T{{[0-9]}}.[[TRUNC_CHAN:[XYZW]]], KC0[2].Z, literal.y
+
+; EG-NEXT: 3(4.203895e-45), 65535(9.183409e-41)
+; IG 1: Truncate the calculated shift amount for the mask
+
+; IG 2: Shift the value and the mask
+; EG: LSHL T[[RW_GPR]].X, PS, PV.[[SHIFT_CHAN]]
+; EG: LSHL * T[[RW_GPR]].W, literal.x, PV.[[SHIFT_CHAN]]
+; EG-NEXT: 65535
+; IG 3: Initialize the Y and Z channels to zero
+; XXX: An optimal scheduler should merge this into one of the previous IGs.
+; EG: MOV T[[RW_GPR]].Y, 0.0
+; EG: MOV * T[[RW_GPR]].Z, 0.0
+
+; SI: buffer_store_short
+define void @store_i16(i16 addrspace(1)* %out, i16 %in) {
+entry:
+ store i16 %in, i16 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}store_v2i8:
+; EG: MEM_RAT MSKOR
+; EG-NOT: MEM_RAT MSKOR
+
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+define void @store_v2i8(<2 x i8> addrspace(1)* %out, <2 x i32> %in) {
+entry:
+ %0 = trunc <2 x i32> %in to <2 x i8>
+ store <2 x i8> %0, <2 x i8> addrspace(1)* %out
+ ret void
+}
+
+
+; FUNC-LABEL: {{^}}store_v2i16:
+; EG: MEM_RAT_CACHELESS STORE_RAW
+
+; CM: MEM_RAT_CACHELESS STORE_DWORD
+
+; SI: buffer_store_short
+; SI: buffer_store_short
+define void @store_v2i16(<2 x i16> addrspace(1)* %out, <2 x i32> %in) {
+entry:
+ %0 = trunc <2 x i32> %in to <2 x i16>
+ store <2 x i16> %0, <2 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}store_v4i8:
+; EG: MEM_RAT_CACHELESS STORE_RAW
+
+; CM: MEM_RAT_CACHELESS STORE_DWORD
+
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+define void @store_v4i8(<4 x i8> addrspace(1)* %out, <4 x i32> %in) {
+entry:
+ %0 = trunc <4 x i32> %in to <4 x i8>
+ store <4 x i8> %0, <4 x i8> addrspace(1)* %out
+ ret void
+}
+
+; floating-point store
+; FUNC-LABEL: {{^}}store_f32:
+; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+\.X, T[0-9]+\.X}}, 1
+
+; CM: MEM_RAT_CACHELESS STORE_DWORD T{{[0-9]+\.X, T[0-9]+\.X}}
+
+; SI: buffer_store_dword
+
+define void @store_f32(float addrspace(1)* %out, float %in) {
+ store float %in, float addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}store_v4i16:
+; EG: MEM_RAT MSKOR
+; EG: MEM_RAT MSKOR
+; EG: MEM_RAT MSKOR
+; EG: MEM_RAT MSKOR
+; EG-NOT: MEM_RAT MSKOR
+
+; SI: buffer_store_short
+; SI: buffer_store_short
+; SI: buffer_store_short
+; SI: buffer_store_short
+; SI-NOT: buffer_store_byte
+define void @store_v4i16(<4 x i16> addrspace(1)* %out, <4 x i32> %in) {
+entry:
+ %0 = trunc <4 x i32> %in to <4 x i16>
+ store <4 x i16> %0, <4 x i16> addrspace(1)* %out
+ ret void
+}
+
+; vec2 floating-point stores
+; FUNC-LABEL: {{^}}store_v2f32:
+; EG: MEM_RAT_CACHELESS STORE_RAW
+
+; CM: MEM_RAT_CACHELESS STORE_DWORD
+
+; SI: buffer_store_dwordx2
+
+define void @store_v2f32(<2 x float> addrspace(1)* %out, float %a, float %b) {
+entry:
+ %0 = insertelement <2 x float> <float 0.0, float 0.0>, float %a, i32 0
+ %1 = insertelement <2 x float> %0, float %b, i32 1
+ store <2 x float> %1, <2 x float> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}store_v4i32:
+; EG: MEM_RAT_CACHELESS STORE_RAW
+; EG-NOT: MEM_RAT_CACHELESS STORE_RAW
+
+; CM: MEM_RAT_CACHELESS STORE_DWORD
+; CM-NOT: MEM_RAT_CACHELESS STORE_DWORD
+
+; SI: buffer_store_dwordx4
+define void @store_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %in) {
+entry:
+ store <4 x i32> %in, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}store_i64_i8:
+; EG: MEM_RAT MSKOR
+; SI: buffer_store_byte
+define void @store_i64_i8(i8 addrspace(1)* %out, i64 %in) {
+entry:
+ %0 = trunc i64 %in to i8
+ store i8 %0, i8 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}store_i64_i16:
+; EG: MEM_RAT MSKOR
+; SI: buffer_store_short
+define void @store_i64_i16(i16 addrspace(1)* %out, i64 %in) {
+entry:
+ %0 = trunc i64 %in to i16
+ store i16 %0, i16 addrspace(1)* %out
+ ret void
+}
+
+;===------------------------------------------------------------------------===;
+; Local Address Space
+;===------------------------------------------------------------------------===;
+
+; FUNC-LABEL: {{^}}store_local_i1:
+; EG: LDS_BYTE_WRITE
+; SI: ds_write_b8
+define void @store_local_i1(i1 addrspace(3)* %out) {
+entry:
+ store i1 true, i1 addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}store_local_i8:
+; EG: LDS_BYTE_WRITE
+
+; SI: ds_write_b8
+define void @store_local_i8(i8 addrspace(3)* %out, i8 %in) {
+ store i8 %in, i8 addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}store_local_i16:
+; EG: LDS_SHORT_WRITE
+
+; SI: ds_write_b16
+define void @store_local_i16(i16 addrspace(3)* %out, i16 %in) {
+ store i16 %in, i16 addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}store_local_v2i16:
+; EG: LDS_WRITE
+
+; CM: LDS_WRITE
+
+; SI: ds_write_b16
+; SI: ds_write_b16
+define void @store_local_v2i16(<2 x i16> addrspace(3)* %out, <2 x i16> %in) {
+entry:
+ store <2 x i16> %in, <2 x i16> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}store_local_v4i8:
+; EG: LDS_WRITE
+
+; CM: LDS_WRITE
+
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+define void @store_local_v4i8(<4 x i8> addrspace(3)* %out, <4 x i8> %in) {
+entry:
+ store <4 x i8> %in, <4 x i8> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}store_local_v2i32:
+; EG: LDS_WRITE
+; EG: LDS_WRITE
+
+; CM: LDS_WRITE
+; CM: LDS_WRITE
+
+; SI: ds_write_b64
+define void @store_local_v2i32(<2 x i32> addrspace(3)* %out, <2 x i32> %in) {
+entry:
+ store <2 x i32> %in, <2 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}store_local_v4i32:
+; EG: LDS_WRITE
+; EG: LDS_WRITE
+; EG: LDS_WRITE
+; EG: LDS_WRITE
+
+; CM: LDS_WRITE
+; CM: LDS_WRITE
+; CM: LDS_WRITE
+; CM: LDS_WRITE
+
+; SI: ds_write_b32
+; SI: ds_write_b32
+; SI: ds_write_b32
+; SI: ds_write_b32
+define void @store_local_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> %in) {
+entry:
+ store <4 x i32> %in, <4 x i32> addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}store_local_i64_i8:
+; EG: LDS_BYTE_WRITE
+; SI: ds_write_b8
+define void @store_local_i64_i8(i8 addrspace(3)* %out, i64 %in) {
+entry:
+ %0 = trunc i64 %in to i8
+ store i8 %0, i8 addrspace(3)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}store_local_i64_i16:
+; EG: LDS_SHORT_WRITE
+; SI: ds_write_b16
+define void @store_local_i64_i16(i16 addrspace(3)* %out, i64 %in) {
+entry:
+ %0 = trunc i64 %in to i16
+ store i16 %0, i16 addrspace(3)* %out
+ ret void
+}
+
+; The stores in this function are combined by the optimizer to create a
+; 64-bit store with 32-bit alignment. This is legal for SI and the legalizer
+; should not try to split the 64-bit store back into two 32-bit stores.
+;
+; Evergreen / Northern Islands don't support 64-bit stores yet, so there should
+; be two 32-bit stores.
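+;
+; Purely as an illustration (this is not part of the checks; %merged and
+; %out.64 are hypothetical names, and the combine actually happens in the
+; SelectionDAG rather than in the IR), the merged form described above is
+; roughly equivalent to:
+;   %out.64 = bitcast i32 addrspace(1)* %out to i64 addrspace(1)*
+;   store i64 %merged, i64 addrspace(1)* %out.64, align 4
+; SI can emit this directly as a single 64-bit store (buffer_store_dwordx2),
+; while Evergreen / Northern Islands legalize it back into two 32-bit stores.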
+
+; FUNC-LABEL: {{^}}vecload2:
+; EG: MEM_RAT_CACHELESS STORE_RAW
+
+; CM: MEM_RAT_CACHELESS STORE_DWORD
+
+; SI: buffer_store_dwordx2
+define void @vecload2(i32 addrspace(1)* nocapture %out, i32 addrspace(2)* nocapture %mem) #0 {
+entry:
+ %0 = load i32, i32 addrspace(2)* %mem, align 4
+ %arrayidx1.i = getelementptr inbounds i32, i32 addrspace(2)* %mem, i64 1
+ %1 = load i32, i32 addrspace(2)* %arrayidx1.i, align 4
+ store i32 %0, i32 addrspace(1)* %out, align 4
+ %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
+ store i32 %1, i32 addrspace(1)* %arrayidx1, align 4
+ ret void
+}
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+; When i128 was a legal type this program generated cannot select errors:
+
+; FUNC-LABEL: {{^}}"i128-const-store":
+; FIXME: We should be able to do this with one store instruction
+; EG: STORE_RAW
+; EG: STORE_RAW
+; EG: STORE_RAW
+; EG: STORE_RAW
+; CM: STORE_DWORD
+; CM: STORE_DWORD
+; CM: STORE_DWORD
+; CM: STORE_DWORD
+; SI: buffer_store_dwordx4
+define void @i128-const-store(i32 addrspace(1)* %out) {
+entry:
+ store i32 1, i32 addrspace(1)* %out, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
+ store i32 1, i32 addrspace(1)* %arrayidx2, align 4
+ %arrayidx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
+ store i32 2, i32 addrspace(1)* %arrayidx4, align 4
+ %arrayidx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
+ store i32 2, i32 addrspace(1)* %arrayidx6, align 4
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/store.r600.ll b/test/CodeGen/AMDGPU/store.r600.ll
new file mode 100644
index 000000000000..696fb033b5ec
--- /dev/null
+++ b/test/CodeGen/AMDGPU/store.r600.ll
@@ -0,0 +1,22 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s
+
+; XXX: Merge this test into store.ll once it is supported on SI
+
+; v4i32 store
+; EG: {{^}}store_v4i32:
+; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+\.XYZW, T[0-9]+\.X}}, 1
+
+define void @store_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+ %1 = load <4 x i32>, <4 x i32> addrspace(1) * %in
+ store <4 x i32> %1, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+; v4f32 store
+; EG: {{^}}store_v4f32:
+; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+\.XYZW, T[0-9]+\.X}}, 1
+define void @store_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
+ %1 = load <4 x float>, <4 x float> addrspace(1) * %in
+ store <4 x float> %1, <4 x float> addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/structurize.ll b/test/CodeGen/AMDGPU/structurize.ll
new file mode 100644
index 000000000000..02e592e9a559
--- /dev/null
+++ b/test/CodeGen/AMDGPU/structurize.ll
@@ -0,0 +1,83 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood -mattr=disable-irstructurizer | FileCheck %s
+; Test case for a crash in the AMDILCFGStructurizer from a CFG like this:
+;
+; entry
+; / \
+; diamond_head branch_from
+; / \ |
+; diamond_false diamond_true
+; \ /
+; done
+;
+; When the diamond_true branch had more than 100 instructions.
+;
+;
+
+; CHECK-LABEL: {{^}}branch_into_diamond:
+; === entry block:
+; CHECK: ALU_PUSH_BEFORE
+; === Branch instruction (IF):
+; CHECK: JUMP
+ ; === branch_from block
+ ; CHECK: ALU
+ ; === Duplicated diamond_true block (There can be more than one ALU clause):
+ ; === XXX: We should be able to optimize this so the basic block is not
+ ; === duplicated. See comments in
+ ; === AMDGPUCFGStructurizer::improveSimpleJumpintoIf()
+ ; CHECK: ALU
+; === Branch instruction (ELSE):
+; CHECK: ELSE
+ ; === diamond_head block:
+ ; CHECK: ALU_PUSH_BEFORE
+ ; === Branch instruction (IF):
+ ; CHECK: JUMP
+ ; === diamond_true block (There can be more than one ALU clause):
+ ; ALU
+ ; === Branch instruction (ELSE):
+ ; CHECK: ELSE
+ ; === diamond_false block plus implicit ENDIF
+ ; CHECK: ALU_POP_AFTER
+; === Branch instruction (ENDIF):
+; CHECK: POP
+; === done block:
+; CHECK: ALU
+; CHECK: MEM_RAT_CACHELESS
+; CHECK: CF_END
+
+
+define void @branch_into_diamond(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
+entry:
+%0 = icmp ne i32 %a, 0
+ br i1 %0, label %diamond_head, label %branch_from
+
+diamond_head:
+ %1 = icmp ne i32 %a, 1
+ br i1 %1, label %diamond_true, label %diamond_false
+
+branch_from:
+ %2 = add i32 %a, 1
+ br label %diamond_true
+
+diamond_false:
+ %3 = add i32 %a, 2
+ br label %done
+
+diamond_true:
+ %4 = phi i32 [%2, %branch_from], [%a, %diamond_head]
+ ; This block needs to be > 100 ISA instructions to hit the bug,
+ ; so we'll use udiv instructions.
+ %div0 = udiv i32 %a, %b
+ %div1 = udiv i32 %div0, %4
+ %div2 = udiv i32 %div1, 11
+ %div3 = udiv i32 %div2, %a
+ %div4 = udiv i32 %div3, %b
+ %div5 = udiv i32 %div4, %c
+ %div6 = udiv i32 %div5, %div0
+ %div7 = udiv i32 %div6, %div1
+ br label %done
+
+done:
+ %5 = phi i32 [%3, %diamond_false], [%div7, %diamond_true]
+ store i32 %5, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/structurize1.ll b/test/CodeGen/AMDGPU/structurize1.ll
new file mode 100644
index 000000000000..77432c1f9d2b
--- /dev/null
+++ b/test/CodeGen/AMDGPU/structurize1.ll
@@ -0,0 +1,62 @@
+; RUN: llc < %s -march=r600 -mattr=disable-ifcvt -mcpu=redwood | FileCheck %s
+
+; This tests for a bug where the AMDILCFGStructurizer was crashing on loops
+; like this:
+;
+; for (i = 0; i < x; i++) {
+; if (cond0) {
+; if (cond1) {
+;
+; } else {
+;
+; }
+; if (cond2) {
+;
+; }
+; }
+; }
+
+; CHECK-LABEL: {{^}}if_inside_loop:
+; CHECK: LOOP_START_DX10
+; CHECK: END_LOOP
+define void @if_inside_loop(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) {
+entry:
+ br label %for.body
+
+for.body:
+ %0 = phi i32 [0, %entry], [%inc, %for.inc]
+ %val = phi i32 [0, %entry], [%val.for.inc, %for.inc]
+ %inc = add i32 %0, 1
+ %1 = icmp ult i32 10, %a
+ br i1 %1, label %for.inc, label %if.then
+
+if.then:
+ %2 = icmp ne i32 0, %b
+ br i1 %2, label %if.then.true, label %if.then.false
+
+if.then.true:
+ %3 = add i32 %a, %val
+ br label %if
+
+if.then.false:
+ %4 = mul i32 %a, %val
+ br label %if
+
+if:
+ %val.if = phi i32 [%3, %if.then.true], [%4, %if.then.false]
+ %5 = icmp ne i32 0, %c
+ br i1 %5, label %if.true, label %for.inc
+
+if.true:
+ %6 = add i32 %a, %val.if
+ br label %for.inc
+
+for.inc:
+ %val.for.inc = phi i32 [%val, %for.body], [%val.if, %if], [%6, %if.true]
+ %7 = icmp ne i32 0, %d
+ br i1 %7, label %for.body, label %exit
+
+exit:
+ store i32 %val.for.inc, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/sub.ll b/test/CodeGen/AMDGPU/sub.ll
new file mode 100644
index 000000000000..b7fba0efa5b2
--- /dev/null
+++ b/test/CodeGen/AMDGPU/sub.ll
@@ -0,0 +1,130 @@
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+
+declare i32 @llvm.r600.read.tidig.x() readnone
+
+; FUNC-LABEL: {{^}}test_sub_i32:
+; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+; SI: v_subrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+define void @test_sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+ %a = load i32, i32 addrspace(1)* %in
+ %b = load i32, i32 addrspace(1)* %b_ptr
+ %result = sub i32 %a, %b
+ store i32 %result, i32 addrspace(1)* %out
+ ret void
+}
+
+
+; FUNC-LABEL: {{^}}test_sub_v2i32:
+; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+; SI: v_sub_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_sub_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+
+define void @test_sub_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+ %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
+ %a = load <2 x i32>, <2 x i32> addrspace(1) * %in
+ %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr
+ %result = sub <2 x i32> %a, %b
+ store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_sub_v4i32:
+; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+; SI: v_sub_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_sub_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_sub_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_sub_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+
+define void @test_sub_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+ %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
+ %a = load <4 x i32>, <4 x i32> addrspace(1) * %in
+ %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr
+ %result = sub <4 x i32> %a, %b
+ store <4 x i32> %result, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_sub_i64:
+; SI: s_sub_u32
+; SI: s_subb_u32
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.[XYZW]]]
+; EG: MEM_RAT_CACHELESS STORE_RAW [[HI:T[0-9]+\.[XYZW]]]
+; EG-DAG: SUB_INT {{[* ]*}}[[LO]]
+; EG-DAG: SUBB_UINT
+; EG-DAG: SUB_INT
+; EG-DAG: SUB_INT {{[* ]*}}[[HI]]
+; EG-NOT: SUB
+define void @s_sub_i64(i64 addrspace(1)* noalias %out, i64 %a, i64 %b) nounwind {
+ %result = sub i64 %a, %b
+ store i64 %result, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_sub_i64:
+; SI: v_sub_i32_e32
+; SI: v_subb_u32_e32
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.[XYZW]]]
+; EG: MEM_RAT_CACHELESS STORE_RAW [[HI:T[0-9]+\.[XYZW]]]
+; EG-DAG: SUB_INT {{[* ]*}}[[LO]]
+; EG-DAG: SUBB_UINT
+; EG-DAG: SUB_INT
+; EG-DAG: SUB_INT {{[* ]*}}[[HI]]
+; EG-NOT: SUB
+define void @v_sub_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %inA, i64 addrspace(1)* noalias %inB) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x() readnone
+ %a_ptr = getelementptr i64, i64 addrspace(1)* %inA, i32 %tid
+ %b_ptr = getelementptr i64, i64 addrspace(1)* %inB, i32 %tid
+ %a = load i64, i64 addrspace(1)* %a_ptr
+ %b = load i64, i64 addrspace(1)* %b_ptr
+ %result = sub i64 %a, %b
+ store i64 %result, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_test_sub_v2i64:
+; SI: v_sub_i32_e32
+; SI: v_subb_u32_e32
+; SI: v_sub_i32_e32
+; SI: v_subb_u32_e32
+define void @v_test_sub_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* noalias %inA, <2 x i64> addrspace(1)* noalias %inB) {
+ %tid = call i32 @llvm.r600.read.tidig.x() readnone
+ %a_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inA, i32 %tid
+ %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inB, i32 %tid
+ %a = load <2 x i64>, <2 x i64> addrspace(1)* %a_ptr
+ %b = load <2 x i64>, <2 x i64> addrspace(1)* %b_ptr
+ %result = sub <2 x i64> %a, %b
+ store <2 x i64> %result, <2 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_test_sub_v4i64:
+; SI: v_sub_i32_e32
+; SI: v_subb_u32_e32
+; SI: v_sub_i32_e32
+; SI: v_subb_u32_e32
+; SI: v_sub_i32_e32
+; SI: v_subb_u32_e32
+; SI: v_sub_i32_e32
+; SI: v_subb_u32_e32
+define void @v_test_sub_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* noalias %inA, <4 x i64> addrspace(1)* noalias %inB) {
+ %tid = call i32 @llvm.r600.read.tidig.x() readnone
+ %a_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %inA, i32 %tid
+ %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %inB, i32 %tid
+ %a = load <4 x i64>, <4 x i64> addrspace(1)* %a_ptr
+ %b = load <4 x i64>, <4 x i64> addrspace(1)* %b_ptr
+ %result = sub <4 x i64> %a, %b
+ store <4 x i64> %result, <4 x i64> addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/subreg-coalescer-crash.ll b/test/CodeGen/AMDGPU/subreg-coalescer-crash.ll
new file mode 100644
index 000000000000..c4dae4736cfa
--- /dev/null
+++ b/test/CodeGen/AMDGPU/subreg-coalescer-crash.ll
@@ -0,0 +1,109 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -o - %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -o - %s
+
+; SI-LABEL:{{^}}row_filter_C1_D0:
+; SI: s_endpgm
+; Function Attrs: nounwind
+define void @row_filter_C1_D0() {
+entry:
+ br i1 undef, label %for.inc.1, label %do.body.preheader
+
+do.body.preheader: ; preds = %entry
+ %0 = insertelement <4 x i32> zeroinitializer, i32 undef, i32 1
+ br i1 undef, label %do.body56.1, label %do.body90
+
+do.body90: ; preds = %do.body56.2, %do.body56.1, %do.body.preheader
+ %1 = phi <4 x i32> [ %6, %do.body56.2 ], [ %5, %do.body56.1 ], [ %0, %do.body.preheader ]
+ %2 = insertelement <4 x i32> %1, i32 undef, i32 2
+ %3 = insertelement <4 x i32> %2, i32 undef, i32 3
+ br i1 undef, label %do.body124.1, label %do.body.1562.preheader
+
+do.body.1562.preheader: ; preds = %do.body124.1, %do.body90
+ %storemerge = phi <4 x i32> [ %3, %do.body90 ], [ %7, %do.body124.1 ]
+ %4 = insertelement <4 x i32> undef, i32 undef, i32 1
+ br label %for.inc.1
+
+do.body56.1: ; preds = %do.body.preheader
+ %5 = insertelement <4 x i32> %0, i32 undef, i32 1
+ %or.cond472.1 = or i1 undef, undef
+ br i1 %or.cond472.1, label %do.body56.2, label %do.body90
+
+do.body56.2: ; preds = %do.body56.1
+ %6 = insertelement <4 x i32> %5, i32 undef, i32 1
+ br label %do.body90
+
+do.body124.1: ; preds = %do.body90
+ %7 = insertelement <4 x i32> %3, i32 undef, i32 3
+ br label %do.body.1562.preheader
+
+for.inc.1: ; preds = %do.body.1562.preheader, %entry
+ %storemerge591 = phi <4 x i32> [ zeroinitializer, %entry ], [ %storemerge, %do.body.1562.preheader ]
+ %add.i495 = add <4 x i32> %storemerge591, undef
+ unreachable
+}
+
+; SI-LABEL: {{^}}foo:
+; SI: s_endpgm
+define void @foo() #0 {
+bb:
+ br i1 undef, label %bb2, label %bb1
+
+bb1: ; preds = %bb
+ br i1 undef, label %bb4, label %bb6
+
+bb2: ; preds = %bb4, %bb
+ %tmp = phi float [ %tmp5, %bb4 ], [ 0.000000e+00, %bb ]
+ br i1 undef, label %bb9, label %bb13
+
+bb4: ; preds = %bb7, %bb6, %bb1
+ %tmp5 = phi float [ undef, %bb1 ], [ undef, %bb6 ], [ %tmp8, %bb7 ]
+ br label %bb2
+
+bb6: ; preds = %bb1
+ br i1 undef, label %bb7, label %bb4
+
+bb7: ; preds = %bb6
+ %tmp8 = fmul float undef, undef
+ br label %bb4
+
+bb9: ; preds = %bb2
+ %tmp10 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 2)
+ %tmp11 = extractelement <4 x float> %tmp10, i32 1
+ %tmp12 = extractelement <4 x float> %tmp10, i32 3
+ br label %bb14
+
+bb13: ; preds = %bb2
+ br i1 undef, label %bb23, label %bb24
+
+bb14: ; preds = %bb27, %bb24, %bb9
+ %tmp15 = phi float [ %tmp12, %bb9 ], [ undef, %bb27 ], [ 0.000000e+00, %bb24 ]
+ %tmp16 = phi float [ %tmp11, %bb9 ], [ undef, %bb27 ], [ %tmp25, %bb24 ]
+ %tmp17 = fmul float 10.5, %tmp16
+ %tmp18 = fmul float 11.5, %tmp15
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp18, float %tmp17, float %tmp17, float %tmp17)
+ ret void
+
+bb23: ; preds = %bb13
+ br i1 undef, label %bb24, label %bb26
+
+bb24: ; preds = %bb26, %bb23, %bb13
+ %tmp25 = phi float [ %tmp, %bb13 ], [ %tmp, %bb26 ], [ 0.000000e+00, %bb23 ]
+ br i1 undef, label %bb27, label %bb14
+
+bb26: ; preds = %bb23
+ br label %bb24
+
+bb27: ; preds = %bb24
+ br label %bb14
+}
+
+; Function Attrs: nounwind readnone
+declare <4 x float> @llvm.SI.sample.v2i32(<2 x i32>, <32 x i8>, <16 x i8>, i32) #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.SI.packf16(float, float) #1
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+
+attributes #0 = { "ShaderType"="0" "enable-no-nans-fp-math"="true" "unsafe-fp-math"="true" }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/subreg-eliminate-dead.ll b/test/CodeGen/AMDGPU/subreg-eliminate-dead.ll
new file mode 100644
index 000000000000..8bd995a8ecbb
--- /dev/null
+++ b/test/CodeGen/AMDGPU/subreg-eliminate-dead.ll
@@ -0,0 +1,19 @@
+; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs -o - %s | FileCheck %s
+; LiveRangeEdit::eliminateDeadDef did not update LiveInterval sub ranges
+; properly.
+
+; Just make sure this test doesn't crash.
+; CHECK-LABEL: foobar:
+; CHECK: s_endpgm
+define void @foobar() {
+ %v0 = icmp eq <4 x i32> undef, <i32 0, i32 1, i32 2, i32 3>
+ %v3 = sext <4 x i1> %v0 to <4 x i32>
+ %v4 = extractelement <4 x i32> %v3, i32 1
+ %v5 = icmp ne i32 %v4, 0
+ %v6 = select i1 %v5, i32 undef, i32 0
+ %v15 = insertelement <2 x i32> undef, i32 %v6, i32 1
+ store <2 x i32> %v15, <2 x i32> addrspace(1)* undef, align 8
+ ret void
+}
+
+declare double @llvm.fma.f64(double, double, double)
diff --git a/test/CodeGen/AMDGPU/swizzle-export.ll b/test/CodeGen/AMDGPU/swizzle-export.ll
new file mode 100644
index 000000000000..000ee2faa478
--- /dev/null
+++ b/test/CodeGen/AMDGPU/swizzle-export.ll
@@ -0,0 +1,129 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s
+
+;EG: {{^}}main:
+;EG: EXPORT T{{[0-9]+}}.XYXX
+;EG: EXPORT T{{[0-9]+}}.ZXXX
+;EG: EXPORT T{{[0-9]+}}.XXWX
+;EG: EXPORT T{{[0-9]+}}.XXXW
+
+define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 {
+main_body:
+ %0 = extractelement <4 x float> %reg1, i32 0
+ %1 = extractelement <4 x float> %reg1, i32 1
+ %2 = extractelement <4 x float> %reg1, i32 2
+ %3 = extractelement <4 x float> %reg1, i32 3
+ %4 = load <4 x float>, <4 x float> addrspace(8)* null
+ %5 = extractelement <4 x float> %4, i32 1
+ %6 = load <4 x float>, <4 x float> addrspace(8)* null
+ %7 = extractelement <4 x float> %6, i32 2
+ %8 = load <4 x float>, <4 x float> addrspace(8)* null
+ %9 = extractelement <4 x float> %8, i32 0
+ %10 = fmul float 0.000000e+00, %9
+ %11 = load <4 x float>, <4 x float> addrspace(8)* null
+ %12 = extractelement <4 x float> %11, i32 0
+ %13 = fmul float %5, %12
+ %14 = load <4 x float>, <4 x float> addrspace(8)* null
+ %15 = extractelement <4 x float> %14, i32 0
+ %16 = fmul float 0.000000e+00, %15
+ %17 = load <4 x float>, <4 x float> addrspace(8)* null
+ %18 = extractelement <4 x float> %17, i32 0
+ %19 = fmul float 0.000000e+00, %18
+ %20 = load <4 x float>, <4 x float> addrspace(8)* null
+ %21 = extractelement <4 x float> %20, i32 0
+ %22 = fmul float %7, %21
+ %23 = load <4 x float>, <4 x float> addrspace(8)* null
+ %24 = extractelement <4 x float> %23, i32 0
+ %25 = fmul float 0.000000e+00, %24
+ %26 = load <4 x float>, <4 x float> addrspace(8)* null
+ %27 = extractelement <4 x float> %26, i32 0
+ %28 = fmul float 0.000000e+00, %27
+ %29 = load <4 x float>, <4 x float> addrspace(8)* null
+ %30 = extractelement <4 x float> %29, i32 0
+ %31 = fmul float 0.000000e+00, %30
+ %32 = load <4 x float>, <4 x float> addrspace(8)* null
+ %33 = extractelement <4 x float> %32, i32 0
+ %34 = fmul float 0.000000e+00, %33
+ %35 = load <4 x float>, <4 x float> addrspace(8)* null
+ %36 = extractelement <4 x float> %35, i32 0
+ %37 = fmul float 0.000000e+00, %36
+ %38 = load <4 x float>, <4 x float> addrspace(8)* null
+ %39 = extractelement <4 x float> %38, i32 0
+ %40 = fmul float 1.000000e+00, %39
+ %41 = load <4 x float>, <4 x float> addrspace(8)* null
+ %42 = extractelement <4 x float> %41, i32 0
+ %43 = fmul float 0.000000e+00, %42
+ %44 = load <4 x float>, <4 x float> addrspace(8)* null
+ %45 = extractelement <4 x float> %44, i32 0
+ %46 = fmul float 0.000000e+00, %45
+ %47 = load <4 x float>, <4 x float> addrspace(8)* null
+ %48 = extractelement <4 x float> %47, i32 0
+ %49 = fmul float 0.000000e+00, %48
+ %50 = load <4 x float>, <4 x float> addrspace(8)* null
+ %51 = extractelement <4 x float> %50, i32 0
+ %52 = fmul float 0.000000e+00, %51
+ %53 = load <4 x float>, <4 x float> addrspace(8)* null
+ %54 = extractelement <4 x float> %53, i32 0
+ %55 = fmul float 1.000000e+00, %54
+ %56 = insertelement <4 x float> undef, float %0, i32 0
+ %57 = insertelement <4 x float> %56, float %1, i32 1
+ %58 = insertelement <4 x float> %57, float %2, i32 2
+ %59 = insertelement <4 x float> %58, float %3, i32 3
+ call void @llvm.R600.store.swizzle(<4 x float> %59, i32 60, i32 1)
+ %60 = insertelement <4 x float> undef, float %10, i32 0
+ %61 = insertelement <4 x float> %60, float %13, i32 1
+ %62 = insertelement <4 x float> %61, float %16, i32 2
+ %63 = insertelement <4 x float> %62, float %19, i32 3
+ call void @llvm.R600.store.swizzle(<4 x float> %63, i32 0, i32 2)
+ %64 = insertelement <4 x float> undef, float %22, i32 0
+ %65 = insertelement <4 x float> %64, float %25, i32 1
+ %66 = insertelement <4 x float> %65, float %28, i32 2
+ %67 = insertelement <4 x float> %66, float %31, i32 3
+ call void @llvm.R600.store.swizzle(<4 x float> %67, i32 1, i32 2)
+ %68 = insertelement <4 x float> undef, float %34, i32 0
+ %69 = insertelement <4 x float> %68, float %37, i32 1
+ %70 = insertelement <4 x float> %69, float %40, i32 2
+ %71 = insertelement <4 x float> %70, float %43, i32 3
+ call void @llvm.R600.store.swizzle(<4 x float> %71, i32 2, i32 2)
+ %72 = insertelement <4 x float> undef, float %46, i32 0
+ %73 = insertelement <4 x float> %72, float %49, i32 1
+ %74 = insertelement <4 x float> %73, float %52, i32 2
+ %75 = insertelement <4 x float> %74, float %55, i32 3
+ call void @llvm.R600.store.swizzle(<4 x float> %75, i32 3, i32 2)
+ ret void
+}
+
+; EG: {{^}}main2:
+; EG: T{{[0-9]+}}.XY__
+; EG: T{{[0-9]+}}.ZXY0
+
+define void @main2(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 {
+main_body:
+ %0 = extractelement <4 x float> %reg1, i32 0
+ %1 = extractelement <4 x float> %reg1, i32 1
+ %2 = fadd float %0, 2.5
+ %3 = fmul float %1, 3.5
+ %4 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+ %5 = extractelement <4 x float> %4, i32 0
+ %6 = call float @llvm.cos.f32(float %5)
+ %7 = load <4 x float>, <4 x float> addrspace(8)* null
+ %8 = extractelement <4 x float> %7, i32 0
+ %9 = load <4 x float>, <4 x float> addrspace(8)* null
+ %10 = extractelement <4 x float> %9, i32 1
+ %11 = insertelement <4 x float> undef, float %2, i32 0
+ %12 = insertelement <4 x float> %11, float %3, i32 1
+ call void @llvm.R600.store.swizzle(<4 x float> %12, i32 60, i32 1)
+ %13 = insertelement <4 x float> undef, float %6, i32 0
+ %14 = insertelement <4 x float> %13, float %8, i32 1
+ %15 = insertelement <4 x float> %14, float %10, i32 2
+ %16 = insertelement <4 x float> %15, float 0.000000e+00, i32 3
+ call void @llvm.R600.store.swizzle(<4 x float> %16, i32 0, i32 2)
+ ret void
+}
+
+; Function Attrs: nounwind readonly
+declare float @llvm.cos.f32(float) #1
+
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+
+attributes #0 = { "ShaderType"="1" }
+attributes #1 = { nounwind readonly }
diff --git a/test/CodeGen/AMDGPU/tex-clause-antidep.ll b/test/CodeGen/AMDGPU/tex-clause-antidep.ll
new file mode 100644
index 000000000000..cbb9c50974a4
--- /dev/null
+++ b/test/CodeGen/AMDGPU/tex-clause-antidep.ll
@@ -0,0 +1,25 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+;CHECK: TEX
+;CHECK-NEXT: ALU
+
+define void @test(<4 x float> inreg %reg0) #0 {
+ %1 = extractelement <4 x float> %reg0, i32 0
+ %2 = extractelement <4 x float> %reg0, i32 1
+ %3 = extractelement <4 x float> %reg0, i32 2
+ %4 = extractelement <4 x float> %reg0, i32 3
+ %5 = insertelement <4 x float> undef, float %1, i32 0
+ %6 = insertelement <4 x float> %5, float %2, i32 1
+ %7 = insertelement <4 x float> %6, float %3, i32 2
+ %8 = insertelement <4 x float> %7, float %4, i32 3
+ %9 = call <4 x float> @llvm.R600.tex(<4 x float> %8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %10 = call <4 x float> @llvm.R600.tex(<4 x float> %8, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %11 = fadd <4 x float> %9, %10
+ call void @llvm.R600.store.swizzle(<4 x float> %11, i32 0, i32 0)
+ ret void
+}
+
+declare <4 x float> @llvm.R600.tex(<4 x float>, i32, i32, i32, i32, i32, i32, i32, i32, i32) readnone
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+
+attributes #0 = { "ShaderType"="1" } \ No newline at end of file
diff --git a/test/CodeGen/AMDGPU/texture-input-merge.ll b/test/CodeGen/AMDGPU/texture-input-merge.ll
new file mode 100644
index 000000000000..789538af5821
--- /dev/null
+++ b/test/CodeGen/AMDGPU/texture-input-merge.ll
@@ -0,0 +1,31 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+;CHECK-NOT: MOV
+
+define void @test(<4 x float> inreg %reg0) #0 {
+ %1 = extractelement <4 x float> %reg0, i32 0
+ %2 = extractelement <4 x float> %reg0, i32 1
+ %3 = extractelement <4 x float> %reg0, i32 2
+ %4 = extractelement <4 x float> %reg0, i32 3
+ %5 = fmul float %1, 3.0
+ %6 = fmul float %2, 3.0
+ %7 = fmul float %3, 3.0
+ %8 = fmul float %4, 3.0
+ %9 = insertelement <4 x float> undef, float %5, i32 0
+ %10 = insertelement <4 x float> %9, float %6, i32 1
+ %11 = insertelement <4 x float> undef, float %7, i32 0
+ %12 = insertelement <4 x float> %11, float %5, i32 1
+ %13 = insertelement <4 x float> undef, float %8, i32 0
+ %14 = call <4 x float> @llvm.R600.tex(<4 x float> %10, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %15 = call <4 x float> @llvm.R600.tex(<4 x float> %12, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %16 = call <4 x float> @llvm.R600.tex(<4 x float> %13, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+ %17 = fadd <4 x float> %14, %15
+ %18 = fadd <4 x float> %17, %16
+ call void @llvm.R600.store.swizzle(<4 x float> %18, i32 0, i32 0)
+ ret void
+}
+
+declare <4 x float> @llvm.R600.tex(<4 x float>, i32, i32, i32, i32, i32, i32, i32, i32, i32) readnone
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+
+attributes #0 = { "ShaderType"="1" } \ No newline at end of file
diff --git a/test/CodeGen/AMDGPU/trunc-cmp-constant.ll b/test/CodeGen/AMDGPU/trunc-cmp-constant.ll
new file mode 100644
index 000000000000..dac74728b3ce
--- /dev/null
+++ b/test/CodeGen/AMDGPU/trunc-cmp-constant.ll
@@ -0,0 +1,170 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+; FUNC-LABEL {{^}}sextload_i1_to_i32_trunc_cmp_eq_0:
+; SI: buffer_load_ubyte [[LOAD:v[0-9]+]]
+; SI: v_and_b32_e32 [[TMP:v[0-9]+]], 1, [[LOAD]]
+; SI: v_cmp_eq_i32_e32 vcc, 1, [[TMP]]{{$}}
+; SI: s_xor_b64 s{{\[[0-9]+:[0-9]+\]}}, vcc, -1{{$}}
+; SI: v_cndmask_b32_e64
+; SI: buffer_store_byte
+define void @sextload_i1_to_i32_trunc_cmp_eq_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+ %load = load i1, i1 addrspace(1)* %in
+ %ext = sext i1 %load to i32
+ %cmp = icmp eq i32 %ext, 0
+ store i1 %cmp, i1 addrspace(1)* %out
+ ret void
+}
+
+; FIXME: The negate should be inverting the compare (see the sketch after this test).
+; FUNC-LABEL: {{^}}zextload_i1_to_i32_trunc_cmp_eq_0:
+; SI: buffer_load_ubyte [[LOAD:v[0-9]+]]
+; SI: v_and_b32_e32 [[TMP:v[0-9]+]], 1, [[LOAD]]
+; SI: v_cmp_eq_i32_e32 vcc, 1, [[TMP]]{{$}}
+; SI-NEXT: s_xor_b64 [[NEG:s\[[0-9]+:[0-9]+\]]], vcc, -1
+; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[NEG]]
+; SI-NEXT: buffer_store_byte [[RESULT]]
+define void @zextload_i1_to_i32_trunc_cmp_eq_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+ %load = load i1, i1 addrspace(1)* %in
+ %ext = zext i1 %load to i32
+ %cmp = icmp eq i32 %ext, 0
+ store i1 %cmp, i1 addrspace(1)* %out
+ ret void
+}
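+
+; A rough sketch of what the FIXME above asks for (illustrative only, not a
+; check line; register numbers are hypothetical). Because the and leaves only
+; the low bit, the value being compared is 0 or 1, so the s_xor_b64 of the
+; condition mask could be folded into the compare itself:
+;   v_and_b32_e32     v0, 1, v0
+;   v_cmp_ne_i32_e32  vcc, 1, v0    ; true exactly when the loaded bit is 0
+;   v_cndmask_b32_e64 v1, 0, 1, vcc
+; rather than v_cmp_eq_i32 followed by s_xor_b64 on the result.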
+
+; FUNC-LABEL: {{^}}sextload_i1_to_i32_trunc_cmp_eq_1:
+; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}}
+; SI: buffer_store_byte [[RESULT]]
+define void @sextload_i1_to_i32_trunc_cmp_eq_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+ %load = load i1, i1 addrspace(1)* %in
+ %ext = sext i1 %load to i32
+ %cmp = icmp eq i32 %ext, 1
+ store i1 %cmp, i1 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_i1_to_i32_trunc_cmp_eq_1:
+; SI: buffer_load_ubyte [[LOAD:v[0-9]+]]
+; SI: v_and_b32_e32 [[RESULT:v[0-9]+]], 1, [[LOAD]]
+; SI-NEXT: buffer_store_byte [[RESULT]]
+define void @zextload_i1_to_i32_trunc_cmp_eq_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+ %load = load i1, i1 addrspace(1)* %in
+ %ext = zext i1 %load to i32
+ %cmp = icmp eq i32 %ext, 1
+ store i1 %cmp, i1 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_i1_to_i32_trunc_cmp_eq_neg1:
+; SI: buffer_load_ubyte [[LOAD:v[0-9]+]]
+; SI: v_and_b32_e32 [[RESULT:v[0-9]+]], 1, [[LOAD]]
+; SI-NEXT: buffer_store_byte [[RESULT]]
+define void @sextload_i1_to_i32_trunc_cmp_eq_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+ %load = load i1, i1 addrspace(1)* %in
+ %ext = sext i1 %load to i32
+ %cmp = icmp eq i32 %ext, -1
+ store i1 %cmp, i1 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_i1_to_i32_trunc_cmp_eq_neg1:
+; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}}
+; SI: buffer_store_byte [[RESULT]]
+define void @zextload_i1_to_i32_trunc_cmp_eq_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+ %load = load i1, i1 addrspace(1)* %in
+ %ext = zext i1 %load to i32
+ %cmp = icmp eq i32 %ext, -1
+ store i1 %cmp, i1 addrspace(1)* %out
+ ret void
+}
+
+
+; FUNC-LABEL {{^}}sextload_i1_to_i32_trunc_cmp_ne_0:
+; SI: buffer_load_ubyte [[LOAD:v[0-9]+]]
+; SI: v_and_b32_e32 [[TMP:v[0-9]+]], 1, [[LOAD]]
+; SI-NEXT: buffer_store_byte [[RESULT]]
+define void @sextload_i1_to_i32_trunc_cmp_ne_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+ %load = load i1, i1 addrspace(1)* %in
+ %ext = sext i1 %load to i32
+ %cmp = icmp ne i32 %ext, 0
+ store i1 %cmp, i1 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_i1_to_i32_trunc_cmp_ne_0:
+; SI: buffer_load_ubyte [[LOAD:v[0-9]+]]
+; SI: v_and_b32_e32 [[TMP:v[0-9]+]], 1, [[LOAD]]
+; SI-NEXT: buffer_store_byte [[RESULT]]
+define void @zextload_i1_to_i32_trunc_cmp_ne_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+ %load = load i1, i1 addrspace(1)* %in
+ %ext = zext i1 %load to i32
+ %cmp = icmp ne i32 %ext, 0
+ store i1 %cmp, i1 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sextload_i1_to_i32_trunc_cmp_ne_1:
+; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 1{{$}}
+; SI: buffer_store_byte [[RESULT]]
+define void @sextload_i1_to_i32_trunc_cmp_ne_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+ %load = load i1, i1 addrspace(1)* %in
+ %ext = sext i1 %load to i32
+ %cmp = icmp ne i32 %ext, 1
+ store i1 %cmp, i1 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_i1_to_i32_trunc_cmp_ne_1:
+; SI: buffer_load_ubyte [[LOAD:v[0-9]+]]
+; SI: v_and_b32_e32 [[TMP:v[0-9]+]], 1, [[LOAD]]
+; SI: v_cmp_eq_i32_e32 vcc, 1, [[TMP]]{{$}}
+; SI-NEXT: s_xor_b64 [[NEG:s\[[0-9]+:[0-9]+\]]], vcc, -1
+; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[NEG]]
+; SI-NEXT: buffer_store_byte [[RESULT]]
+define void @zextload_i1_to_i32_trunc_cmp_ne_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+ %load = load i1, i1 addrspace(1)* %in
+ %ext = zext i1 %load to i32
+ %cmp = icmp ne i32 %ext, 1
+ store i1 %cmp, i1 addrspace(1)* %out
+ ret void
+}
+
+; FIXME: This should be one compare.
+; FUNC-LABEL: {{^}}sextload_i1_to_i32_trunc_cmp_ne_neg1:
+; XSI: buffer_load_ubyte [[LOAD:v[0-9]+]]
+; XSI: v_and_b32_e32 [[TMP:v[0-9]+]], 1, [[LOAD]]
+; XSI: v_cmp_eq_i32_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], [[TMP]], 0{{$}}
+; XSI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CMP0]]
+; XSI-NEXT: buffer_store_byte [[RESULT]]
+define void @sextload_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+ %load = load i1, i1 addrspace(1)* %in
+ %ext = sext i1 %load to i32
+ %cmp = icmp ne i32 %ext, -1
+ store i1 %cmp, i1 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}zextload_i1_to_i32_trunc_cmp_ne_neg1:
+; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 1{{$}}
+; SI: buffer_store_byte [[RESULT]]
+define void @zextload_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+ %load = load i1, i1 addrspace(1)* %in
+ %ext = zext i1 %load to i32
+ %cmp = icmp ne i32 %ext, -1
+ store i1 %cmp, i1 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}masked_load_i1_to_i32_trunc_cmp_ne_neg1:
+; SI: buffer_load_sbyte [[LOAD:v[0-9]+]]
+; SI: v_cmp_ne_i32_e32 vcc, -1, [[LOAD]]{{$}}
+; SI-NEXT: v_cndmask_b32_e64
+; SI-NEXT: buffer_store_byte
+define void @masked_load_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind {
+ %load = load i8, i8 addrspace(1)* %in
+ %masked = and i8 %load, 255
+ %ext = sext i8 %masked to i32
+ %cmp = icmp ne i32 %ext, -1
+ store i1 %cmp, i1 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/trunc-store-f64-to-f16.ll b/test/CodeGen/AMDGPU/trunc-store-f64-to-f16.ll
new file mode 100644
index 000000000000..c29872beef86
--- /dev/null
+++ b/test/CodeGen/AMDGPU/trunc-store-f64-to-f16.ll
@@ -0,0 +1,56 @@
+; XFAIL: *
+; RUN: llc -march=amdgcn -mcpu=SI < %s
+
+; GCN-LABEL: {{^}}global_truncstore_f64_to_f16:
+; GCN: s_endpgm
+define void @global_truncstore_f64_to_f16(half addrspace(1)* %out, double addrspace(1)* %in) #0 {
+ %val = load double, double addrspace(1)* %in
+ %cvt = fptrunc double %val to half
+ store half %cvt, half addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}global_truncstore_v2f64_to_v2f16:
+; GCN: s_endpgm
+define void @global_truncstore_v2f64_to_v2f16(<2 x half> addrspace(1)* %out, <2 x double> addrspace(1)* %in) #0 {
+ %val = load <2 x double>, <2 x double> addrspace(1)* %in
+ %cvt = fptrunc <2 x double> %val to <2 x half>
+ store <2 x half> %cvt, <2 x half> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}global_truncstore_v3f64_to_v3f16:
+; GCN: s_endpgm
+define void @global_truncstore_v3f64_to_v3f16(<3 x half> addrspace(1)* %out, <3 x double> addrspace(1)* %in) #0 {
+ %val = load <3 x double>, <3 x double> addrspace(1)* %in
+ %cvt = fptrunc <3 x double> %val to <3 x half>
+ store <3 x half> %cvt, <3 x half> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}global_truncstore_v4f64_to_v4f16:
+; GCN: s_endpgm
+define void @global_truncstore_v4f64_to_v4f16(<4 x half> addrspace(1)* %out, <4 x double> addrspace(1)* %in) #0 {
+ %val = load <4 x double>, <4 x double> addrspace(1)* %in
+ %cvt = fptrunc <4 x double> %val to <4 x half>
+ store <4 x half> %cvt, <4 x half> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}global_truncstore_v8f64_to_v8f16:
+; GCN: s_endpgm
+define void @global_truncstore_v8f64_to_v8f16(<8 x half> addrspace(1)* %out, <8 x double> addrspace(1)* %in) #0 {
+ %val = load <8 x double>, <8 x double> addrspace(1)* %in
+ %cvt = fptrunc <8 x double> %val to <8 x half>
+ store <8 x half> %cvt, <8 x half> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}global_truncstore_v16f64_to_v16f16:
+; GCN: s_endpgm
+define void @global_truncstore_v16f64_to_v16f16(<16 x half> addrspace(1)* %out, <16 x double> addrspace(1)* %in) #0 {
+ %val = load <16 x double>, <16 x double> addrspace(1)* %in
+ %cvt = fptrunc <16 x double> %val to <16 x half>
+ store <16 x half> %cvt, <16 x half> addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/trunc-store-i1.ll b/test/CodeGen/AMDGPU/trunc-store-i1.ll
new file mode 100644
index 000000000000..b71a838b62cd
--- /dev/null
+++ b/test/CodeGen/AMDGPU/trunc-store-i1.ll
@@ -0,0 +1,33 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
+
+
+; SI-LABEL: {{^}}global_truncstore_i32_to_i1:
+; SI: s_load_dword [[LOAD:s[0-9]+]],
+; SI: s_and_b32 [[SREG:s[0-9]+]], [[LOAD]], 1
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], [[SREG]]
+; SI: buffer_store_byte [[VREG]],
+define void @global_truncstore_i32_to_i1(i1 addrspace(1)* %out, i32 %val) nounwind {
+ %trunc = trunc i32 %val to i1
+ store i1 %trunc, i1 addrspace(1)* %out, align 1
+ ret void
+}
+
+; SI-LABEL: {{^}}global_truncstore_i64_to_i1:
+; SI: buffer_store_byte
+define void @global_truncstore_i64_to_i1(i1 addrspace(1)* %out, i64 %val) nounwind {
+ %trunc = trunc i64 %val to i1
+ store i1 %trunc, i1 addrspace(1)* %out, align 1
+ ret void
+}
+
+; SI-LABEL: {{^}}global_truncstore_i16_to_i1:
+; SI: s_load_dword [[LOAD:s[0-9]+]],
+; SI: s_and_b32 [[SREG:s[0-9]+]], [[LOAD]], 1
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], [[SREG]]
+; SI: buffer_store_byte [[VREG]],
+define void @global_truncstore_i16_to_i1(i1 addrspace(1)* %out, i16 %val) nounwind {
+ %trunc = trunc i16 %val to i1
+ store i1 %trunc, i1 addrspace(1)* %out, align 1
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/trunc-vector-store-assertion-failure.ll b/test/CodeGen/AMDGPU/trunc-vector-store-assertion-failure.ll
new file mode 100644
index 000000000000..878ea3f48995
--- /dev/null
+++ b/test/CodeGen/AMDGPU/trunc-vector-store-assertion-failure.ll
@@ -0,0 +1,20 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+; This tests for a bug in the SelectionDAG where custom lowered truncated
+; vector stores at the end of a basic block were not being added to the
+; LegalizedNodes list, which triggered an assertion failure.
+
+; CHECK-LABEL: {{^}}test:
+; CHECK: MEM_RAT_CACHELESS STORE_RAW
+define void @test(<4 x i8> addrspace(1)* %out, i32 %cond, <4 x i8> %in) {
+entry:
+ %0 = icmp eq i32 %cond, 0
+ br i1 %0, label %if, label %done
+
+if:
+ store <4 x i8> %in, <4 x i8> addrspace(1)* %out
+ br label %done
+
+done:
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/trunc.ll b/test/CodeGen/AMDGPU/trunc.ll
new file mode 100644
index 000000000000..bf690ca4cb28
--- /dev/null
+++ b/test/CodeGen/AMDGPU/trunc.ll
@@ -0,0 +1,100 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG %s
+
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+
+define void @trunc_i64_to_i32_store(i32 addrspace(1)* %out, i64 %in) {
+; SI-LABEL: {{^}}trunc_i64_to_i32_store:
+; SI: s_load_dword [[SLOAD:s[0-9]+]], s[0:1], 0xb
+; SI: v_mov_b32_e32 [[VLOAD:v[0-9]+]], [[SLOAD]]
+; SI: buffer_store_dword [[VLOAD]]
+
+; EG-LABEL: {{^}}trunc_i64_to_i32_store:
+; EG: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
+; EG: LSHR
+; EG-NEXT: 2(
+
+ %result = trunc i64 %in to i32
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}trunc_load_shl_i64:
+; SI-DAG: s_load_dwordx2
+; SI-DAG: s_load_dword [[SREG:s[0-9]+]],
+; SI: s_lshl_b32 [[SHL:s[0-9]+]], [[SREG]], 2
+; SI: v_mov_b32_e32 [[VSHL:v[0-9]+]], [[SHL]]
+; SI: buffer_store_dword [[VSHL]],
+define void @trunc_load_shl_i64(i32 addrspace(1)* %out, i64 %a) {
+ %b = shl i64 %a, 2
+ %result = trunc i64 %b to i32
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}trunc_shl_i64:
+; SI: s_load_dwordx2 s{{\[}}[[LO_SREG:[0-9]+]]:{{[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
+; SI: s_lshl_b64 s{{\[}}[[LO_SHL:[0-9]+]]:{{[0-9]+\]}}, s{{\[}}[[LO_SREG]]:{{[0-9]+\]}}, 2
+; SI: s_add_u32 s[[LO_SREG2:[0-9]+]], s[[LO_SHL]],
+; SI: s_addc_u32
+; SI: v_mov_b32_e32
+; SI: v_mov_b32_e32
+; SI: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG2]]
+; SI: buffer_store_dword v[[LO_VREG]],
+define void @trunc_shl_i64(i64 addrspace(1)* %out2, i32 addrspace(1)* %out, i64 %a) {
+ %aa = add i64 %a, 234 ; Prevent shrinking store.
+ %b = shl i64 %aa, 2
+ %result = trunc i64 %b to i32
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ store i64 %b, i64 addrspace(1)* %out2, align 8 ; Prevent reducing ops to 32-bits
+ ret void
+}
+
+; SI-LABEL: {{^}}trunc_i32_to_i1:
+; SI: v_and_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
+; SI: v_cmp_eq_i32
+define void @trunc_i32_to_i1(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) {
+ %a = load i32, i32 addrspace(1)* %ptr, align 4
+ %trunc = trunc i32 %a to i1
+ %result = select i1 %trunc, i32 1, i32 0
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}sgpr_trunc_i32_to_i1:
+; SI: v_and_b32_e64 v{{[0-9]+}}, 1, s{{[0-9]+}}
+; SI: v_cmp_eq_i32
+define void @sgpr_trunc_i32_to_i1(i32 addrspace(1)* %out, i32 %a) {
+ %trunc = trunc i32 %a to i1
+ %result = select i1 %trunc, i32 1, i32 0
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}s_trunc_i64_to_i1:
+; SI: s_load_dwordx2 s{{\[}}[[SLO:[0-9]+]]:{{[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; SI: v_and_b32_e64 [[MASKED:v[0-9]+]], 1, s[[SLO]]
+; SI: v_cmp_eq_i32_e32 vcc, 1, [[MASKED]]
+; SI: v_cndmask_b32_e64 {{v[0-9]+}}, -12, 63, vcc
+define void @s_trunc_i64_to_i1(i32 addrspace(1)* %out, i64 %x) {
+ %trunc = trunc i64 %x to i1
+ %sel = select i1 %trunc, i32 63, i32 -12
+ store i32 %sel, i32 addrspace(1)* %out
+ ret void
+}
+
+; SI-LABEL: {{^}}v_trunc_i64_to_i1:
+; SI: buffer_load_dwordx2 v{{\[}}[[VLO:[0-9]+]]:{{[0-9]+\]}}
+; SI: v_and_b32_e32 [[MASKED:v[0-9]+]], 1, v[[VLO]]
+; SI: v_cmp_eq_i32_e32 vcc, 1, [[MASKED]]
+; SI: v_cndmask_b32_e64 {{v[0-9]+}}, -12, 63, vcc
+define void @v_trunc_i64_to_i1(i32 addrspace(1)* %out, i64 addrspace(1)* %in) {
+ %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
+ %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ %x = load i64, i64 addrspace(1)* %gep
+
+ %trunc = trunc i64 %x to i1
+ %sel = select i1 %trunc, i32 63, i32 -12
+ store i32 %sel, i32 addrspace(1)* %out.gep
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/tti-unroll-prefs.ll b/test/CodeGen/AMDGPU/tti-unroll-prefs.ll
new file mode 100644
index 000000000000..76c32afc1f21
--- /dev/null
+++ b/test/CodeGen/AMDGPU/tti-unroll-prefs.ll
@@ -0,0 +1,58 @@
+; RUN: opt -loop-unroll -S -mtriple=amdgcn-- -mcpu=SI %s | FileCheck %s
+
+; This IR comes from this OpenCL C code:
+;
+; if (b + 4 > a) {
+; for (int i = 0; i < 4; i++, b++) {
+; if (b + 1 <= a)
+; *(dst + c + b) = 0;
+; else
+; break;
+; }
+; }
+;
+; This test is meant to check that this loop isn't unrolled into more than
+; four iterations. The loop unrolling preferences we currently use cause this
+; loop to not be unrolled at all, but that may change in the future.
+
+; CHECK-LABEL: @test
+; CHECK: store i8 0, i8 addrspace(1)*
+; CHECK-NOT: store i8 0, i8 addrspace(1)*
+; CHECK: ret void
+define void @test(i8 addrspace(1)* nocapture %dst, i32 %a, i32 %b, i32 %c) {
+entry:
+ %add = add nsw i32 %b, 4
+ %cmp = icmp sgt i32 %add, %a
+ br i1 %cmp, label %for.cond.preheader, label %if.end7
+
+for.cond.preheader: ; preds = %entry
+ %cmp313 = icmp slt i32 %b, %a
+ br i1 %cmp313, label %if.then4.lr.ph, label %if.end7.loopexit
+
+if.then4.lr.ph: ; preds = %for.cond.preheader
+ %0 = sext i32 %c to i64
+ br label %if.then4
+
+if.then4: ; preds = %if.then4.lr.ph, %if.then4
+ %i.015 = phi i32 [ 0, %if.then4.lr.ph ], [ %inc, %if.then4 ]
+ %b.addr.014 = phi i32 [ %b, %if.then4.lr.ph ], [ %add2, %if.then4 ]
+ %add2 = add nsw i32 %b.addr.014, 1
+ %1 = sext i32 %b.addr.014 to i64
+ %add.ptr.sum = add nsw i64 %1, %0
+ %add.ptr5 = getelementptr inbounds i8, i8 addrspace(1)* %dst, i64 %add.ptr.sum
+ store i8 0, i8 addrspace(1)* %add.ptr5, align 1
+ %inc = add nsw i32 %i.015, 1
+ %cmp1 = icmp slt i32 %inc, 4
+ %cmp3 = icmp slt i32 %add2, %a
+ %or.cond = and i1 %cmp3, %cmp1
+ br i1 %or.cond, label %if.then4, label %for.cond.if.end7.loopexit_crit_edge
+
+for.cond.if.end7.loopexit_crit_edge: ; preds = %if.then4
+ br label %if.end7.loopexit
+
+if.end7.loopexit: ; preds = %for.cond.if.end7.loopexit_crit_edge, %for.cond.preheader
+ br label %if.end7
+
+if.end7: ; preds = %if.end7.loopexit, %entry
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/uaddo.ll b/test/CodeGen/AMDGPU/uaddo.ll
new file mode 100644
index 000000000000..11438f267ad0
--- /dev/null
+++ b/test/CodeGen/AMDGPU/uaddo.ll
@@ -0,0 +1,85 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs< %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32) nounwind readnone
+declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64) nounwind readnone
+
+; FUNC-LABEL: {{^}}uaddo_i64_zext:
+; SI: add
+; SI: addc
+; SI: addc
+
+; EG: ADDC_UINT
+; EG: ADDC_UINT
+define void @uaddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+ %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) nounwind
+ %val = extractvalue { i64, i1 } %uadd, 0
+ %carry = extractvalue { i64, i1 } %uadd, 1
+ %ext = zext i1 %carry to i64
+ %add2 = add i64 %val, %ext
+ store i64 %add2, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_uaddo_i32:
+; SI: s_add_i32
+
+; EG: ADDC_UINT
+; EG: ADD_INT
+define void @s_uaddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind {
+ %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b) nounwind
+ %val = extractvalue { i32, i1 } %uadd, 0
+ %carry = extractvalue { i32, i1 } %uadd, 1
+ store i32 %val, i32 addrspace(1)* %out, align 4
+ store i1 %carry, i1 addrspace(1)* %carryout
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_uaddo_i32:
+; SI: v_add_i32
+
+; EG: ADDC_UINT
+; EG: ADD_INT
+define void @v_uaddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
+ %a = load i32, i32 addrspace(1)* %aptr, align 4
+ %b = load i32, i32 addrspace(1)* %bptr, align 4
+ %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b) nounwind
+ %val = extractvalue { i32, i1 } %uadd, 0
+ %carry = extractvalue { i32, i1 } %uadd, 1
+ store i32 %val, i32 addrspace(1)* %out, align 4
+ store i1 %carry, i1 addrspace(1)* %carryout
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_uaddo_i64:
+; SI: s_add_u32
+; SI: s_addc_u32
+
+; EG: ADDC_UINT
+; EG: ADD_INT
+define void @s_uaddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind {
+ %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) nounwind
+ %val = extractvalue { i64, i1 } %uadd, 0
+ %carry = extractvalue { i64, i1 } %uadd, 1
+ store i64 %val, i64 addrspace(1)* %out, align 8
+ store i1 %carry, i1 addrspace(1)* %carryout
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_uaddo_i64:
+; SI: v_add_i32
+; SI: v_addc_u32
+
+; EG: ADDC_UINT
+; EG: ADD_INT
+define void @v_uaddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
+ %a = load i64, i64 addrspace(1)* %aptr, align 4
+ %b = load i64, i64 addrspace(1)* %bptr, align 4
+ %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) nounwind
+ %val = extractvalue { i64, i1 } %uadd, 0
+ %carry = extractvalue { i64, i1 } %uadd, 1
+ store i64 %val, i64 addrspace(1)* %out, align 8
+ store i1 %carry, i1 addrspace(1)* %carryout
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/udiv.ll b/test/CodeGen/AMDGPU/udiv.ll
new file mode 100644
index 000000000000..de22a22e5029
--- /dev/null
+++ b/test/CodeGen/AMDGPU/udiv.ll
@@ -0,0 +1,48 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI %s
+
+;EG-LABEL: {{^}}test:
+;EG-NOT: SETGE_INT
+;EG: CF_END
+
+define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+ %a = load i32, i32 addrspace(1) * %in
+ %b = load i32, i32 addrspace(1) * %b_ptr
+ %result = udiv i32 %a, %b
+ store i32 %result, i32 addrspace(1)* %out
+ ret void
+}
+
+; The code generated for udiv is long and complex and may change frequently.
+; The goal of these tests is to make sure instruction selection does not fail
+; when it is given a v2i32 or v4i32 udiv.
+
+;EG-LABEL: {{^}}test2:
+;EG: CF_END
+;SI-LABEL: {{^}}test2:
+;SI: s_endpgm
+
+define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+ %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
+ %a = load <2 x i32>, <2 x i32> addrspace(1) * %in
+ %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr
+ %result = udiv <2 x i32> %a, %b
+ store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+;EG-LABEL: {{^}}test4:
+;EG: CF_END
+;SI-LABEL: {{^}}test4:
+;SI: s_endpgm
+
+define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+ %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
+ %a = load <4 x i32>, <4 x i32> addrspace(1) * %in
+ %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr
+ %result = udiv <4 x i32> %a, %b
+ store <4 x i32> %result, <4 x i32> addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/udivrem.ll b/test/CodeGen/AMDGPU/udivrem.ll
new file mode 100644
index 000000000000..b3837f28209a
--- /dev/null
+++ b/test/CodeGen/AMDGPU/udivrem.ll
@@ -0,0 +1,345 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=EG --check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}test_udivrem:
+; EG: RECIP_UINT
+; EG-DAG: MULHI
+; EG-DAG: MULLO_INT
+; EG-DAG: SUB_INT
+; EG: CNDE_INT
+; EG: MULHI
+; EG-DAG: ADD_INT
+; EG-DAG: SUB_INT
+; EG: CNDE_INT
+; EG: MULHI
+; EG: MULLO_INT
+; EG: SUB_INT
+; EG-DAG: SETGE_UINT
+; EG-DAG: SETGE_UINT
+; EG: AND_INT
+; EG-DAG: ADD_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: ADD_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+
+; SI: v_rcp_iflag_f32_e32 [[RCP:v[0-9]+]]
+; SI-DAG: v_mul_hi_u32 [[RCP_HI:v[0-9]+]], [[RCP]]
+; SI-DAG: v_mul_lo_i32 [[RCP_LO:v[0-9]+]], [[RCP]]
+; SI-DAG: v_sub_i32_e32 [[NEG_RCP_LO:v[0-9]+]], 0, [[RCP_LO]]
+; SI: v_cndmask_b32_e64
+; SI: v_mul_hi_u32 [[E:v[0-9]+]], {{v[0-9]+}}, [[RCP]]
+; SI-DAG: v_add_i32_e32 [[RCP_A_E:v[0-9]+]], [[E]], [[RCP]]
+; SI-DAG: v_subrev_i32_e32 [[RCP_S_E:v[0-9]+]], [[E]], [[RCP]]
+; SI: v_cndmask_b32_e64
+; SI: v_mul_hi_u32 [[Quotient:v[0-9]+]]
+; SI: v_mul_lo_i32 [[Num_S_Remainder:v[0-9]+]]
+; SI-DAG: v_sub_i32_e32 [[Remainder:v[0-9]+]], {{[vs][0-9]+}}, [[Num_S_Remainder]]
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_cndmask_b32_e64
+; SI: v_and_b32_e32 [[Tmp1:v[0-9]+]]
+; SI-DAG: v_add_i32_e32 [[Quotient_A_One:v[0-9]+]], 1, [[Quotient]]
+; SI-DAG: v_subrev_i32_e32 [[Quotient_S_One:v[0-9]+]],
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_add_i32_e32 [[Remainder_A_Den:v[0-9]+]],
+; SI-DAG: v_subrev_i32_e32 [[Remainder_S_Den:v[0-9]+]],
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_cndmask_b32_e64
+; SI: s_endpgm
+define void @test_udivrem(i32 addrspace(1)* %out, i32 %x, i32 %y) {
+ %result0 = udiv i32 %x, %y
+ store i32 %result0, i32 addrspace(1)* %out
+ %result1 = urem i32 %x, %y
+ store i32 %result1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_udivrem_v2:
+; EG-DAG: RECIP_UINT
+; EG-DAG: MULHI
+; EG-DAG: MULLO_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: MULHI
+; EG-DAG: ADD_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: MULHI
+; EG-DAG: MULLO_INT
+; EG-DAG: SUB_INT
+; EG-DAG: SETGE_UINT
+; EG-DAG: SETGE_UINT
+; EG-DAG: AND_INT
+; EG-DAG: ADD_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: ADD_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: RECIP_UINT
+; EG-DAG: MULHI
+; EG-DAG: MULLO_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: MULHI
+; EG-DAG: ADD_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: MULHI
+; EG-DAG: MULLO_INT
+; EG-DAG: SUB_INT
+; EG-DAG: SETGE_UINT
+; EG-DAG: SETGE_UINT
+; EG-DAG: AND_INT
+; EG-DAG: ADD_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: ADD_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+
+; SI-DAG: v_rcp_iflag_f32_e32 [[FIRST_RCP:v[0-9]+]]
+; SI-DAG: v_mul_hi_u32 [[FIRST_RCP_HI:v[0-9]+]], [[FIRST_RCP]]
+; SI-DAG: v_mul_lo_i32 [[FIRST_RCP_LO:v[0-9]+]], [[FIRST_RCP]]
+; SI-DAG: v_sub_i32_e32 [[FIRST_NEG_RCP_LO:v[0-9]+]], 0, [[FIRST_RCP_LO]]
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_mul_hi_u32 [[FIRST_E:v[0-9]+]], {{v[0-9]+}}, [[FIRST_RCP]]
+; SI-DAG: v_add_i32_e32 [[FIRST_RCP_A_E:v[0-9]+]], [[FIRST_E]], [[FIRST_RCP]]
+; SI-DAG: v_subrev_i32_e32 [[FIRST_RCP_S_E:v[0-9]+]], [[FIRST_E]], [[FIRST_RCP]]
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_mul_hi_u32 [[FIRST_Quotient:v[0-9]+]]
+; SI-DAG: v_mul_lo_i32 [[FIRST_Num_S_Remainder:v[0-9]+]]
+; SI-DAG: v_subrev_i32_e32 [[FIRST_Remainder:v[0-9]+]], [[FIRST_Num_S_Remainder]], v{{[0-9]+}}
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_and_b32_e32 [[FIRST_Tmp1:v[0-9]+]]
+; SI-DAG: v_add_i32_e32 [[FIRST_Quotient_A_One:v[0-9]+]], {{.*}}, [[FIRST_Quotient]]
+; SI-DAG: v_subrev_i32_e32 [[FIRST_Quotient_S_One:v[0-9]+]],
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_add_i32_e32 [[FIRST_Remainder_A_Den:v[0-9]+]],
+; SI-DAG: v_subrev_i32_e32 [[FIRST_Remainder_S_Den:v[0-9]+]],
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_rcp_iflag_f32_e32 [[SECOND_RCP:v[0-9]+]]
+; SI-DAG: v_mul_hi_u32 [[SECOND_RCP_HI:v[0-9]+]], [[SECOND_RCP]]
+; SI-DAG: v_mul_lo_i32 [[SECOND_RCP_LO:v[0-9]+]], [[SECOND_RCP]]
+; SI-DAG: v_sub_i32_e32 [[SECOND_NEG_RCP_LO:v[0-9]+]], 0, [[SECOND_RCP_LO]]
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_mul_hi_u32 [[SECOND_E:v[0-9]+]], {{v[0-9]+}}, [[SECOND_RCP]]
+; SI-DAG: v_add_i32_e32 [[SECOND_RCP_A_E:v[0-9]+]], [[SECOND_E]], [[SECOND_RCP]]
+; SI-DAG: v_subrev_i32_e32 [[SECOND_RCP_S_E:v[0-9]+]], [[SECOND_E]], [[SECOND_RCP]]
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_mul_hi_u32 [[SECOND_Quotient:v[0-9]+]]
+; SI-DAG: v_mul_lo_i32 [[SECOND_Num_S_Remainder:v[0-9]+]]
+; SI-DAG: v_subrev_i32_e32 [[SECOND_Remainder:v[0-9]+]], [[SECOND_Num_S_Remainder]], v{{[0-9]+}}
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_and_b32_e32 [[SECOND_Tmp1:v[0-9]+]]
+; SI-DAG: v_add_i32_e32 [[SECOND_Quotient_A_One:v[0-9]+]], {{.*}}, [[SECOND_Quotient]]
+; SI-DAG: v_subrev_i32_e32 [[SECOND_Quotient_S_One:v[0-9]+]],
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_add_i32_e32 [[SECOND_Remainder_A_Den:v[0-9]+]],
+; SI-DAG: v_subrev_i32_e32 [[SECOND_Remainder_S_Den:v[0-9]+]],
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_cndmask_b32_e64
+; SI: s_endpgm
+define void @test_udivrem_v2(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) {
+ %result0 = udiv <2 x i32> %x, %y
+ store <2 x i32> %result0, <2 x i32> addrspace(1)* %out
+ %result1 = urem <2 x i32> %x, %y
+ store <2 x i32> %result1, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+
+; FUNC-LABEL: {{^}}test_udivrem_v4:
+; EG-DAG: RECIP_UINT
+; EG-DAG: MULHI
+; EG-DAG: MULLO_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: MULHI
+; EG-DAG: ADD_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: MULHI
+; EG-DAG: MULLO_INT
+; EG-DAG: SUB_INT
+; EG-DAG: SETGE_UINT
+; EG-DAG: SETGE_UINT
+; EG-DAG: AND_INT
+; EG-DAG: ADD_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: ADD_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: RECIP_UINT
+; EG-DAG: MULHI
+; EG-DAG: MULLO_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: MULHI
+; EG-DAG: ADD_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: MULHI
+; EG-DAG: MULLO_INT
+; EG-DAG: SUB_INT
+; EG-DAG: SETGE_UINT
+; EG-DAG: SETGE_UINT
+; EG-DAG: AND_INT
+; EG-DAG: ADD_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: ADD_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: RECIP_UINT
+; EG-DAG: MULHI
+; EG-DAG: MULLO_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: MULHI
+; EG-DAG: ADD_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: MULHI
+; EG-DAG: MULLO_INT
+; EG-DAG: SUB_INT
+; EG-DAG: SETGE_UINT
+; EG-DAG: SETGE_UINT
+; EG-DAG: AND_INT
+; EG-DAG: ADD_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: ADD_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: RECIP_UINT
+; EG-DAG: MULHI
+; EG-DAG: MULLO_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: MULHI
+; EG-DAG: ADD_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: MULHI
+; EG-DAG: MULLO_INT
+; EG-DAG: SUB_INT
+; EG-DAG: SETGE_UINT
+; EG-DAG: SETGE_UINT
+; EG-DAG: AND_INT
+; EG-DAG: ADD_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: ADD_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+
+; SI-DAG: v_rcp_iflag_f32_e32 [[FIRST_RCP:v[0-9]+]]
+; SI-DAG: v_mul_hi_u32 [[FIRST_RCP_HI:v[0-9]+]], [[FIRST_RCP]]
+; SI-DAG: v_mul_lo_i32 [[FIRST_RCP_LO:v[0-9]+]], [[FIRST_RCP]]
+; SI-DAG: v_sub_i32_e32 [[FIRST_NEG_RCP_LO:v[0-9]+]], 0, [[FIRST_RCP_LO]]
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_mul_hi_u32 [[FIRST_E:v[0-9]+]], {{v[0-9]+}}, [[FIRST_RCP]]
+; SI-DAG: v_add_i32_e32 [[FIRST_RCP_A_E:v[0-9]+]], [[FIRST_E]], [[FIRST_RCP]]
+; SI-DAG: v_subrev_i32_e32 [[FIRST_RCP_S_E:v[0-9]+]], [[FIRST_E]], [[FIRST_RCP]]
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_mul_hi_u32 [[FIRST_Quotient:v[0-9]+]]
+; SI-DAG: v_mul_lo_i32 [[FIRST_Num_S_Remainder:v[0-9]+]]
+; SI-DAG: v_subrev_i32_e32 [[FIRST_Remainder:v[0-9]+]], [[FIRST_Num_S_Remainder]], v{{[0-9]+}}
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_and_b32_e32 [[FIRST_Tmp1:v[0-9]+]]
+; SI-DAG: v_add_i32_e32 [[FIRST_Quotient_A_One:v[0-9]+]], {{.*}}, [[FIRST_Quotient]]
+; SI-DAG: v_subrev_i32_e32 [[FIRST_Quotient_S_One:v[0-9]+]],
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_add_i32_e32 [[FIRST_Remainder_A_Den:v[0-9]+]],
+; SI-DAG: v_subrev_i32_e32 [[FIRST_Remainder_S_Den:v[0-9]+]],
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_rcp_iflag_f32_e32 [[SECOND_RCP:v[0-9]+]]
+; SI-DAG: v_mul_hi_u32 [[SECOND_RCP_HI:v[0-9]+]], [[SECOND_RCP]]
+; SI-DAG: v_mul_lo_i32 [[SECOND_RCP_LO:v[0-9]+]], [[SECOND_RCP]]
+; SI-DAG: v_sub_i32_e32 [[SECOND_NEG_RCP_LO:v[0-9]+]], 0, [[SECOND_RCP_LO]]
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_mul_hi_u32 [[SECOND_E:v[0-9]+]], {{v[0-9]+}}, [[SECOND_RCP]]
+; SI-DAG: v_add_i32_e32 [[SECOND_RCP_A_E:v[0-9]+]], [[SECOND_E]], [[SECOND_RCP]]
+; SI-DAG: v_subrev_i32_e32 [[SECOND_RCP_S_E:v[0-9]+]], [[SECOND_E]], [[SECOND_RCP]]
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_mul_hi_u32 [[SECOND_Quotient:v[0-9]+]]
+; SI-DAG: v_mul_lo_i32 [[SECOND_Num_S_Remainder:v[0-9]+]]
+; SI-DAG: v_subrev_i32_e32 [[SECOND_Remainder:v[0-9]+]], [[SECOND_Num_S_Remainder]], v{{[0-9]+}}
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_and_b32_e32 [[SECOND_Tmp1:v[0-9]+]]
+; SI-DAG: v_add_i32_e32 [[SECOND_Quotient_A_One:v[0-9]+]], {{.*}}, [[SECOND_Quotient]]
+; SI-DAG: v_subrev_i32_e32 [[SECOND_Quotient_S_One:v[0-9]+]],
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_add_i32_e32 [[SECOND_Remainder_A_Den:v[0-9]+]],
+; SI-DAG: v_subrev_i32_e32 [[SECOND_Remainder_S_Den:v[0-9]+]],
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_rcp_iflag_f32_e32 [[THIRD_RCP:v[0-9]+]]
+; SI-DAG: v_mul_hi_u32 [[THIRD_RCP_HI:v[0-9]+]], [[THIRD_RCP]]
+; SI-DAG: v_mul_lo_i32 [[THIRD_RCP_LO:v[0-9]+]], [[THIRD_RCP]]
+; SI-DAG: v_sub_i32_e32 [[THIRD_NEG_RCP_LO:v[0-9]+]], 0, [[THIRD_RCP_LO]]
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_mul_hi_u32 [[THIRD_E:v[0-9]+]], {{v[0-9]+}}, [[THIRD_RCP]]
+; SI-DAG: v_add_i32_e32 [[THIRD_RCP_A_E:v[0-9]+]], [[THIRD_E]], [[THIRD_RCP]]
+; SI-DAG: v_subrev_i32_e32 [[THIRD_RCP_S_E:v[0-9]+]], [[THIRD_E]], [[THIRD_RCP]]
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_mul_hi_u32 [[THIRD_Quotient:v[0-9]+]]
+; SI-DAG: v_mul_lo_i32 [[THIRD_Num_S_Remainder:v[0-9]+]]
+; SI-DAG: v_subrev_i32_e32 [[THIRD_Remainder:v[0-9]+]], [[THIRD_Num_S_Remainder]], {{v[0-9]+}}
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_and_b32_e32 [[THIRD_Tmp1:v[0-9]+]]
+; SI-DAG: v_add_i32_e32 [[THIRD_Quotient_A_One:v[0-9]+]], {{.*}}, [[THIRD_Quotient]]
+; SI-DAG: v_subrev_i32_e32 [[THIRD_Quotient_S_One:v[0-9]+]],
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_add_i32_e32 [[THIRD_Remainder_A_Den:v[0-9]+]],
+; SI-DAG: v_subrev_i32_e32 [[THIRD_Remainder_S_Den:v[0-9]+]],
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_rcp_iflag_f32_e32 [[FOURTH_RCP:v[0-9]+]]
+; SI-DAG: v_mul_hi_u32 [[FOURTH_RCP_HI:v[0-9]+]], [[FOURTH_RCP]]
+; SI-DAG: v_mul_lo_i32 [[FOURTH_RCP_LO:v[0-9]+]], [[FOURTH_RCP]]
+; SI-DAG: v_sub_i32_e32 [[FOURTH_NEG_RCP_LO:v[0-9]+]], 0, [[FOURTH_RCP_LO]]
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_mul_hi_u32 [[FOURTH_E:v[0-9]+]], {{v[0-9]+}}, [[FOURTH_RCP]]
+; SI-DAG: v_add_i32_e32 [[FOURTH_RCP_A_E:v[0-9]+]], [[FOURTH_E]], [[FOURTH_RCP]]
+; SI-DAG: v_subrev_i32_e32 [[FOURTH_RCP_S_E:v[0-9]+]], [[FOURTH_E]], [[FOURTH_RCP]]
+; SI-DAG: v_cndmask_b32_e64
+; SI: s_endpgm
+define void @test_udivrem_v4(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) {
+ %result0 = udiv <4 x i32> %x, %y
+ store <4 x i32> %result0, <4 x i32> addrspace(1)* %out
+ %result1 = urem <4 x i32> %x, %y
+ store <4 x i32> %result1, <4 x i32> addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/udivrem24.ll b/test/CodeGen/AMDGPU/udivrem24.ll
new file mode 100644
index 000000000000..4de881b66f10
--- /dev/null
+++ b/test/CodeGen/AMDGPU/udivrem24.ll
@@ -0,0 +1,245 @@
+; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}udiv24_i8:
+; SI: v_cvt_f32_ubyte
+; SI: v_cvt_f32_ubyte
+; SI: v_rcp_f32
+; SI: v_cvt_u32_f32
+
+; EG: UINT_TO_FLT
+; EG-DAG: UINT_TO_FLT
+; EG-DAG: RECIP_IEEE
+; EG: FLT_TO_UINT
+define void @udiv24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
+ %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1
+ %num = load i8, i8 addrspace(1) * %in
+ %den = load i8, i8 addrspace(1) * %den_ptr
+ %result = udiv i8 %num, %den
+ store i8 %result, i8 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}udiv24_i16:
+; SI: v_cvt_f32_u32
+; SI: v_cvt_f32_u32
+; SI: v_rcp_f32
+; SI: v_cvt_u32_f32
+
+; EG: UINT_TO_FLT
+; EG-DAG: UINT_TO_FLT
+; EG-DAG: RECIP_IEEE
+; EG: FLT_TO_UINT
+define void @udiv24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
+ %den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1
+ %num = load i16, i16 addrspace(1) * %in, align 2
+ %den = load i16, i16 addrspace(1) * %den_ptr, align 2
+ %result = udiv i16 %num, %den
+ store i16 %result, i16 addrspace(1)* %out, align 2
+ ret void
+}
+
+; FUNC-LABEL: {{^}}udiv24_i32:
+; SI: v_cvt_f32_u32
+; SI-DAG: v_cvt_f32_u32
+; SI-DAG: v_rcp_f32
+; SI: v_cvt_u32_f32
+
+; EG: UINT_TO_FLT
+; EG-DAG: UINT_TO_FLT
+; EG-DAG: RECIP_IEEE
+; EG: FLT_TO_UINT
+define void @udiv24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+ %num = load i32, i32 addrspace(1) * %in, align 4
+ %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+ %num.i24.0 = shl i32 %num, 8
+ %den.i24.0 = shl i32 %den, 8
+ %num.i24 = lshr i32 %num.i24.0, 8
+ %den.i24 = lshr i32 %den.i24.0, 8
+ %result = udiv i32 %num.i24, %den.i24
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}udiv25_i32:
+; RCP_IFLAG is for URECIP in the full 32-bit algorithm
+; SI: v_rcp_iflag
+; SI-NOT: v_rcp_f32
+
+; EG-NOT: UINT_TO_FLT
+; EG-NOT: RECIP_IEEE
+define void @udiv25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+ %num = load i32, i32 addrspace(1) * %in, align 4
+ %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+ %num.i24.0 = shl i32 %num, 7
+ %den.i24.0 = shl i32 %den, 7
+ %num.i24 = lshr i32 %num.i24.0, 7
+ %den.i24 = lshr i32 %den.i24.0, 7
+ %result = udiv i32 %num.i24, %den.i24
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_no_udiv24_i32_1:
+; RCP_IFLAG is for URECIP in the full 32-bit algorithm
+; SI: v_rcp_iflag
+; SI-NOT: v_rcp_f32
+
+; EG-NOT: UINT_TO_FLT
+; EG-NOT: RECIP_IEEE
+define void @test_no_udiv24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+ %num = load i32, i32 addrspace(1) * %in, align 4
+ %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+ %num.i24.0 = shl i32 %num, 8
+ %den.i24.0 = shl i32 %den, 7
+ %num.i24 = lshr i32 %num.i24.0, 8
+ %den.i24 = lshr i32 %den.i24.0, 7
+ %result = udiv i32 %num.i24, %den.i24
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_no_udiv24_i32_2:
+; RCP_IFLAG is for URECIP in the full 32-bit algorithm
+; SI: v_rcp_iflag
+; SI-NOT: v_rcp_f32
+
+; EG-NOT: UINT_TO_FLT
+; EG-NOT: RECIP_IEEE
+define void @test_no_udiv24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+ %num = load i32, i32 addrspace(1) * %in, align 4
+ %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+ %num.i24.0 = shl i32 %num, 7
+ %den.i24.0 = shl i32 %den, 8
+ %num.i24 = lshr i32 %num.i24.0, 7
+ %den.i24 = lshr i32 %den.i24.0, 8
+ %result = udiv i32 %num.i24, %den.i24
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}urem24_i8:
+; SI: v_cvt_f32_ubyte
+; SI: v_cvt_f32_ubyte
+; SI: v_rcp_f32
+; SI: v_cvt_u32_f32
+
+; EG: UINT_TO_FLT
+; EG-DAG: UINT_TO_FLT
+; EG-DAG: RECIP_IEEE
+; EG: FLT_TO_UINT
+define void @urem24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
+ %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1
+ %num = load i8, i8 addrspace(1) * %in
+ %den = load i8, i8 addrspace(1) * %den_ptr
+ %result = urem i8 %num, %den
+ store i8 %result, i8 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}urem24_i16:
+; SI: v_cvt_f32_u32
+; SI: v_cvt_f32_u32
+; SI: v_rcp_f32
+; SI: v_cvt_u32_f32
+
+; EG: UINT_TO_FLT
+; EG-DAG: UINT_TO_FLT
+; EG-DAG: RECIP_IEEE
+; EG: FLT_TO_UINT
+define void @urem24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
+ %den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1
+ %num = load i16, i16 addrspace(1) * %in, align 2
+ %den = load i16, i16 addrspace(1) * %den_ptr, align 2
+ %result = urem i16 %num, %den
+ store i16 %result, i16 addrspace(1)* %out, align 2
+ ret void
+}
+
+; FUNC-LABEL: {{^}}urem24_i32:
+; SI: v_cvt_f32_u32
+; SI: v_cvt_f32_u32
+; SI: v_rcp_f32
+; SI: v_cvt_u32_f32
+
+; EG: UINT_TO_FLT
+; EG-DAG: UINT_TO_FLT
+; EG-DAG: RECIP_IEEE
+; EG: FLT_TO_UINT
+define void @urem24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+ %num = load i32, i32 addrspace(1) * %in, align 4
+ %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+ %num.i24.0 = shl i32 %num, 8
+ %den.i24.0 = shl i32 %den, 8
+ %num.i24 = lshr i32 %num.i24.0, 8
+ %den.i24 = lshr i32 %den.i24.0, 8
+ %result = urem i32 %num.i24, %den.i24
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}urem25_i32:
+; RCP_IFLAG is for URECIP in the full 32-bit algorithm
+; SI: v_rcp_iflag
+; SI-NOT: v_rcp_f32
+
+; EG-NOT: UINT_TO_FLT
+; EG-NOT: RECIP_IEEE
+define void @urem25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+ %num = load i32, i32 addrspace(1) * %in, align 4
+ %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+ %num.i24.0 = shl i32 %num, 7
+ %den.i24.0 = shl i32 %den, 7
+ %num.i24 = lshr i32 %num.i24.0, 7
+ %den.i24 = lshr i32 %den.i24.0, 7
+ %result = urem i32 %num.i24, %den.i24
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_no_urem24_i32_1:
+; RCP_IFLAG is for URECIP in the full 32-bit algorithm
+; SI: v_rcp_iflag
+; SI-NOT: v_rcp_f32
+
+; EG-NOT: UINT_TO_FLT
+; EG-NOT: RECIP_IEEE
+define void @test_no_urem24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+ %num = load i32, i32 addrspace(1) * %in, align 4
+ %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+ %num.i24.0 = shl i32 %num, 8
+ %den.i24.0 = shl i32 %den, 7
+ %num.i24 = lshr i32 %num.i24.0, 8
+ %den.i24 = lshr i32 %den.i24.0, 7
+ %result = urem i32 %num.i24, %den.i24
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_no_urem24_i32_2:
+; RCP_IFLAG is for URECIP in the full 32-bit algorithm
+; SI: v_rcp_iflag
+; SI-NOT: v_rcp_f32
+
+; EG-NOT: UINT_TO_FLT
+; EG-NOT: RECIP_IEEE
+define void @test_no_urem24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+ %num = load i32, i32 addrspace(1) * %in, align 4
+ %den = load i32, i32 addrspace(1) * %den_ptr, align 4
+ %num.i24.0 = shl i32 %num, 7
+ %den.i24.0 = shl i32 %den, 8
+ %num.i24 = lshr i32 %num.i24.0, 7
+ %den.i24 = lshr i32 %den.i24.0, 8
+ %result = urem i32 %num.i24, %den.i24
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/udivrem64.ll b/test/CodeGen/AMDGPU/udivrem64.ll
new file mode 100644
index 000000000000..9f3069bdf80c
--- /dev/null
+++ b/test/CodeGen/AMDGPU/udivrem64.ll
@@ -0,0 +1,223 @@
+;RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=GCN --check-prefix=FUNC %s
+;RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=VI --check-prefix=GCN --check-prefix=FUNC %s
+;RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=EG --check-prefix=FUNC %s
+
+;FUNC-LABEL: {{^}}test_udiv:
+;EG: RECIP_UINT
+;EG: LSHL {{.*}}, 1,
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN-NOT: v_mad_f32
+;SI-NOT: v_lshr_b64
+;VI-NOT: v_lshrrev_b64
+;GCN: s_endpgm
+define void @test_udiv(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+ %result = udiv i64 %x, %y
+ store i64 %result, i64 addrspace(1)* %out
+ ret void
+}
+
+;FUNC-LABEL: {{^}}test_urem:
+;EG: RECIP_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: BFE_UINT
+;EG: AND_INT {{.*}}, 1,
+
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN: s_bfe_u32
+;GCN-NOT: v_mad_f32
+;SI-NOT: v_lshr_b64
+;VI-NOT: v_lshrrev_b64
+;GCN: s_endpgm
+define void @test_urem(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+ %result = urem i64 %x, %y
+ store i64 %result, i64 addrspace(1)* %out
+ ret void
+}
+
+;FUNC-LABEL: {{^}}test_udiv3264:
+;EG: RECIP_UINT
+;EG-NOT: BFE_UINT
+
+;GCN-NOT: s_bfe_u32
+;GCN-NOT: v_mad_f32
+;SI-NOT: v_lshr_b64
+;VI-NOT: v_lshrrev_b64
+;GCN: s_endpgm
+define void @test_udiv3264(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+ %1 = lshr i64 %x, 33
+ %2 = lshr i64 %y, 33
+ %result = udiv i64 %1, %2
+ store i64 %result, i64 addrspace(1)* %out
+ ret void
+}
+
+;FUNC-LABEL: {{^}}test_urem3264:
+;EG: RECIP_UINT
+;EG-NOT: BFE_UINT
+
+;GCN-NOT: s_bfe_u32
+;GCN-NOT: v_mad_f32
+;SI-NOT: v_lshr_b64
+;VI-NOT: v_lshrrev_b64
+;GCN: s_endpgm
+define void @test_urem3264(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+ %1 = lshr i64 %x, 33
+ %2 = lshr i64 %y, 33
+ %result = urem i64 %1, %2
+ store i64 %result, i64 addrspace(1)* %out
+ ret void
+}
+
+;FUNC-LABEL: {{^}}test_udiv2464:
+;EG: UINT_TO_FLT
+;EG: UINT_TO_FLT
+;EG: FLT_TO_UINT
+;EG-NOT: RECIP_UINT
+;EG-NOT: BFE_UINT
+
+;SI-NOT: v_lshr_b64
+;VI-NOT: v_lshrrev_b64
+;GCN: v_mad_f32
+;GCN: s_endpgm
+define void @test_udiv2464(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+ %1 = lshr i64 %x, 40
+ %2 = lshr i64 %y, 40
+ %result = udiv i64 %1, %2
+ store i64 %result, i64 addrspace(1)* %out
+ ret void
+}
+
+;FUNC-LABEL: {{^}}test_urem2464:
+;EG: UINT_TO_FLT
+;EG: UINT_TO_FLT
+;EG: FLT_TO_UINT
+;EG-NOT: RECIP_UINT
+;EG-NOT: BFE_UINT
+
+;SI-NOT: v_lshr_b64
+;VI-NOT: v_lshrrev_b64
+;GCN: v_mad_f32
+;GCN: s_endpgm
+define void @test_urem2464(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+ %1 = lshr i64 %x, 40
+ %2 = lshr i64 %y, 40
+ %result = urem i64 %1, %2
+ store i64 %result, i64 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/uint_to_fp.f64.ll b/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
new file mode 100644
index 000000000000..dfec8eb15cb7
--- /dev/null
+++ b/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
@@ -0,0 +1,98 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+
+; SI-LABEL: {{^}}v_uint_to_fp_i64_to_f64
+; SI: buffer_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
+; SI: v_cvt_f64_u32_e32 [[HI_CONV:v\[[0-9]+:[0-9]+\]]], v[[HI]]
+; SI: v_ldexp_f64 [[LDEXP:v\[[0-9]+:[0-9]+\]]], [[HI_CONV]], 32
+; SI: v_cvt_f64_u32_e32 [[LO_CONV:v\[[0-9]+:[0-9]+\]]], v[[LO]]
+; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[LDEXP]], [[LO_CONV]]
+; SI: buffer_store_dwordx2 [[RESULT]]
+define void @v_uint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 addrspace(1)* %in) {
+ %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
+ %val = load i64, i64 addrspace(1)* %gep, align 8
+ %result = uitofp i64 %val to double
+ store double %result, double addrspace(1)* %out
+ ret void
+}
+
+; SI-LABEL: {{^}}s_uint_to_fp_i64_to_f64
+define void @s_uint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 %in) {
+ %cast = uitofp i64 %in to double
+ store double %cast, double addrspace(1)* %out, align 8
+ ret void
+}
+
+; SI-LABEL: {{^}}s_uint_to_fp_v2i64_to_v2f64
+define void @s_uint_to_fp_v2i64_to_v2f64(<2 x double> addrspace(1)* %out, <2 x i64> %in) {
+ %cast = uitofp <2 x i64> %in to <2 x double>
+ store <2 x double> %cast, <2 x double> addrspace(1)* %out, align 16
+ ret void
+}
+
+; SI-LABEL: {{^}}s_uint_to_fp_v4i64_to_v4f64
+define void @s_uint_to_fp_v4i64_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i64> %in) {
+ %cast = uitofp <4 x i64> %in to <4 x double>
+ store <4 x double> %cast, <4 x double> addrspace(1)* %out, align 16
+ ret void
+}
+
+; SI-LABEL: {{^}}s_uint_to_fp_i32_to_f64
+; SI: v_cvt_f64_u32_e32
+; SI: s_endpgm
+define void @s_uint_to_fp_i32_to_f64(double addrspace(1)* %out, i32 %in) {
+ %cast = uitofp i32 %in to double
+ store double %cast, double addrspace(1)* %out, align 8
+ ret void
+}
+
+; SI-LABEL: {{^}}s_uint_to_fp_v2i32_to_v2f64
+; SI: v_cvt_f64_u32_e32
+; SI: v_cvt_f64_u32_e32
+; SI: s_endpgm
+define void @s_uint_to_fp_v2i32_to_v2f64(<2 x double> addrspace(1)* %out, <2 x i32> %in) {
+ %cast = uitofp <2 x i32> %in to <2 x double>
+ store <2 x double> %cast, <2 x double> addrspace(1)* %out, align 16
+ ret void
+}
+
+; SI-LABEL: {{^}}s_uint_to_fp_v4i32_to_v4f64
+; SI: v_cvt_f64_u32_e32
+; SI: v_cvt_f64_u32_e32
+; SI: v_cvt_f64_u32_e32
+; SI: v_cvt_f64_u32_e32
+; SI: s_endpgm
+define void @s_uint_to_fp_v4i32_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i32> %in) {
+ %cast = uitofp <4 x i32> %in to <4 x double>
+ store <4 x double> %cast, <4 x double> addrspace(1)* %out, align 16
+ ret void
+}
+
+; FIXME: select on 0, 0
+; SI-LABEL: {{^}}uint_to_fp_i1_to_f64:
+; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]\]]],
+; We can't fold the SGPRs into v_cndmask_b32_e64, because it already
+; uses an SGPR for [[CMP]]
+; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, v{{[0-9]+}}, [[CMP]]
+; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 0, [[CMP]]
+; SI: buffer_store_dwordx2
+; SI: s_endpgm
+define void @uint_to_fp_i1_to_f64(double addrspace(1)* %out, i32 %in) {
+ %cmp = icmp eq i32 %in, 0
+ %fp = uitofp i1 %cmp to double
+ store double %fp, double addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}uint_to_fp_i1_to_f64_load:
+; SI: v_cndmask_b32_e64 [[IRESULT:v[0-9]]], 0, 1
+; SI-NEXT: v_cvt_f64_u32_e32 [[RESULT:v\[[0-9]+:[0-9]\]]], [[IRESULT]]
+; SI: buffer_store_dwordx2 [[RESULT]]
+; SI: s_endpgm
+define void @uint_to_fp_i1_to_f64_load(double addrspace(1)* %out, i1 %in) {
+ %fp = uitofp i1 %in to double
+ store double %fp, double addrspace(1)* %out, align 8
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/uint_to_fp.ll b/test/CodeGen/AMDGPU/uint_to_fp.ll
new file mode 100644
index 000000000000..00fea80b1bc8
--- /dev/null
+++ b/test/CodeGen/AMDGPU/uint_to_fp.ll
@@ -0,0 +1,82 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}uint_to_fp_i32_to_f32:
+; R600-DAG: UINT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].Z
+
+; SI: v_cvt_f32_u32_e32
+; SI: s_endpgm
+define void @uint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 %in) {
+ %result = uitofp i32 %in to float
+ store float %result, float addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}uint_to_fp_v2i32_to_v2f32:
+; R600-DAG: UINT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].W
+; R600-DAG: UINT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[3].X
+
+; SI: v_cvt_f32_u32_e32
+; SI: v_cvt_f32_u32_e32
+; SI: s_endpgm
+define void @uint_to_fp_v2i32_to_v2f32(<2 x float> addrspace(1)* %out, <2 x i32> %in) {
+ %result = uitofp <2 x i32> %in to <2 x float>
+ store <2 x float> %result, <2 x float> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}uint_to_fp_v4i32_to_v4f32:
+; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+; SI: v_cvt_f32_u32_e32
+; SI: v_cvt_f32_u32_e32
+; SI: v_cvt_f32_u32_e32
+; SI: v_cvt_f32_u32_e32
+; SI: s_endpgm
+define void @uint_to_fp_v4i32_to_v4f32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+ %value = load <4 x i32>, <4 x i32> addrspace(1) * %in
+ %result = uitofp <4 x i32> %value to <4 x float>
+ store <4 x float> %result, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}uint_to_fp_i64_to_f32:
+; R600: UINT_TO_FLT
+; R600: UINT_TO_FLT
+; R600: MULADD_IEEE
+; SI: v_cvt_f32_u32_e32
+; SI: v_cvt_f32_u32_e32
+; SI: v_madmk_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, 0x4f800000
+; SI: s_endpgm
+define void @uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 %in) {
+entry:
+ %0 = uitofp i64 %in to float
+ store float %0, float addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}uint_to_fp_i1_to_f32:
+; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]\]]],
+; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1.0, [[CMP]]
+; SI: buffer_store_dword [[RESULT]],
+; SI: s_endpgm
+define void @uint_to_fp_i1_to_f32(float addrspace(1)* %out, i32 %in) {
+ %cmp = icmp eq i32 %in, 0
+ %fp = uitofp i1 %cmp to float
+ store float %fp, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}uint_to_fp_i1_to_f32_load:
+; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1.0
+; SI: buffer_store_dword [[RESULT]],
+; SI: s_endpgm
+define void @uint_to_fp_i1_to_f32_load(float addrspace(1)* %out, i1 %in) {
+ %fp = uitofp i1 %in to float
+ store float %fp, float addrspace(1)* %out, align 4
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/unaligned-load-store.ll b/test/CodeGen/AMDGPU/unaligned-load-store.ll
new file mode 100644
index 000000000000..82d88ebd3ae7
--- /dev/null
+++ b/test/CodeGen/AMDGPU/unaligned-load-store.ll
@@ -0,0 +1,254 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+; SI-LABEL: {{^}}unaligned_load_store_i16_local:
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: s_endpgm
+define void @unaligned_load_store_i16_local(i16 addrspace(3)* %p, i16 addrspace(3)* %r) nounwind {
+ %v = load i16, i16 addrspace(3)* %p, align 1
+ store i16 %v, i16 addrspace(3)* %r, align 1
+ ret void
+}
+
+; SI-LABEL: {{^}}unaligned_load_store_i16_global:
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: s_endpgm
+define void @unaligned_load_store_i16_global(i16 addrspace(1)* %p, i16 addrspace(1)* %r) nounwind {
+ %v = load i16, i16 addrspace(1)* %p, align 1
+ store i16 %v, i16 addrspace(1)* %r, align 1
+ ret void
+}
+
+; SI-LABEL: {{^}}unaligned_load_store_i32_local:
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: s_endpgm
+define void @unaligned_load_store_i32_local(i32 addrspace(3)* %p, i32 addrspace(3)* %r) nounwind {
+ %v = load i32, i32 addrspace(3)* %p, align 1
+ store i32 %v, i32 addrspace(3)* %r, align 1
+ ret void
+}
+
+; SI-LABEL: {{^}}unaligned_load_store_i32_global:
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+define void @unaligned_load_store_i32_global(i32 addrspace(1)* %p, i32 addrspace(1)* %r) nounwind {
+ %v = load i32, i32 addrspace(1)* %p, align 1
+ store i32 %v, i32 addrspace(1)* %r, align 1
+ ret void
+}
+
+; SI-LABEL: {{^}}unaligned_load_store_i64_local:
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: s_endpgm
+define void @unaligned_load_store_i64_local(i64 addrspace(3)* %p, i64 addrspace(3)* %r) {
+ %v = load i64, i64 addrspace(3)* %p, align 1
+ store i64 %v, i64 addrspace(3)* %r, align 1
+ ret void
+}
+
+; SI-LABEL: {{^}}unaligned_load_store_i64_global:
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+define void @unaligned_load_store_i64_global(i64 addrspace(1)* %p, i64 addrspace(1)* %r) {
+ %v = load i64, i64 addrspace(1)* %p, align 1
+ store i64 %v, i64 addrspace(1)* %r, align 1
+ ret void
+}
+
+; SI-LABEL: {{^}}unaligned_load_store_v4i32_local:
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: s_endpgm
+define void @unaligned_load_store_v4i32_local(<4 x i32> addrspace(3)* %p, <4 x i32> addrspace(3)* %r) nounwind {
+ %v = load <4 x i32>, <4 x i32> addrspace(3)* %p, align 1
+ store <4 x i32> %v, <4 x i32> addrspace(3)* %r, align 1
+ ret void
+}
+
+; FIXME: We mark v4i32 as custom, so misaligned loads are never expanded.
+; FIXME-SI-LABEL: {{^}}unaligned_load_store_v4i32_global
+; FIXME-SI: buffer_load_ubyte
+; FIXME-SI: buffer_load_ubyte
+; FIXME-SI: buffer_load_ubyte
+; FIXME-SI: buffer_load_ubyte
+; FIXME-SI: buffer_load_ubyte
+; FIXME-SI: buffer_load_ubyte
+; FIXME-SI: buffer_load_ubyte
+; FIXME-SI: buffer_load_ubyte
+; FIXME-SI: buffer_load_ubyte
+; FIXME-SI: buffer_load_ubyte
+; FIXME-SI: buffer_load_ubyte
+; FIXME-SI: buffer_load_ubyte
+; FIXME-SI: buffer_load_ubyte
+; FIXME-SI: buffer_load_ubyte
+; FIXME-SI: buffer_load_ubyte
+; FIXME-SI: buffer_load_ubyte
+define void @unaligned_load_store_v4i32_global(<4 x i32> addrspace(1)* %p, <4 x i32> addrspace(1)* %r) nounwind {
+ %v = load <4 x i32>, <4 x i32> addrspace(1)* %p, align 1
+ store <4 x i32> %v, <4 x i32> addrspace(1)* %r, align 1
+ ret void
+}
+
+; SI-LABEL: {{^}}load_lds_i64_align_4:
+; SI: ds_read2_b32
+; SI: s_endpgm
+define void @load_lds_i64_align_4(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
+ %val = load i64, i64 addrspace(3)* %in, align 4
+ store i64 %val, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; SI-LABEL: {{^}}load_lds_i64_align_4_with_offset
+; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]}} offset0:8 offset1:9
+; SI: s_endpgm
+define void @load_lds_i64_align_4_with_offset(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
+ %ptr = getelementptr i64, i64 addrspace(3)* %in, i32 4
+ %val = load i64, i64 addrspace(3)* %ptr, align 4
+ store i64 %val, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; SI-LABEL: {{^}}load_lds_i64_align_4_with_split_offset:
+; Tests the case where the lo offset fits in 8 bits but the hi offset requires 9 bits.
+; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]}} offset1:1
+; SI: s_endpgm
+define void @load_lds_i64_align_4_with_split_offset(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
+ %ptr = bitcast i64 addrspace(3)* %in to i32 addrspace(3)*
+ %ptr255 = getelementptr i32, i32 addrspace(3)* %ptr, i32 255
+ %ptri64 = bitcast i32 addrspace(3)* %ptr255 to i64 addrspace(3)*
+ %val = load i64, i64 addrspace(3)* %ptri64, align 4
+ store i64 %val, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; SI-LABEL: {{^}}load_lds_i64_align_1:
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: buffer_store_dwordx2
+; SI: s_endpgm
+
+define void @load_lds_i64_align_1(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
+ %val = load i64, i64 addrspace(3)* %in, align 1
+ store i64 %val, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; SI-LABEL: {{^}}store_lds_i64_align_4:
+; SI: ds_write2_b32
+; SI: s_endpgm
+define void @store_lds_i64_align_4(i64 addrspace(3)* %out, i64 %val) #0 {
+ store i64 %val, i64 addrspace(3)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}store_lds_i64_align_4_with_offset
+; SI: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:8 offset1:9
+; SI: s_endpgm
+define void @store_lds_i64_align_4_with_offset(i64 addrspace(3)* %out) #0 {
+ %ptr = getelementptr i64, i64 addrspace(3)* %out, i32 4
+ store i64 0, i64 addrspace(3)* %ptr, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}store_lds_i64_align_4_with_split_offset:
+; Tests the case where the lo offset fits in 8 bits but the hi offset requires 9 bits.
+; SI: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1
+; SI: s_endpgm
+define void @store_lds_i64_align_4_with_split_offset(i64 addrspace(3)* %out) #0 {
+ %ptr = bitcast i64 addrspace(3)* %out to i32 addrspace(3)*
+ %ptr255 = getelementptr i32, i32 addrspace(3)* %ptr, i32 255
+ %ptri64 = bitcast i32 addrspace(3)* %ptr255 to i64 addrspace(3)*
+ store i64 0, i64 addrspace(3)* %ptri64, align 4
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/unhandled-loop-condition-assertion.ll b/test/CodeGen/AMDGPU/unhandled-loop-condition-assertion.ll
new file mode 100644
index 000000000000..036a7e91b47f
--- /dev/null
+++ b/test/CodeGen/AMDGPU/unhandled-loop-condition-assertion.ll
@@ -0,0 +1,115 @@
+; REQUIRES: asserts
+; XFAIL: *
+; RUN: llc -O0 -verify-machineinstrs -asm-verbose=0 -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=COMMON %s
+; RUN: llc -O0 -verify-machineinstrs -asm-verbose=0 -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=COMMON %s
+; RUN: llc -O0 -verify-machineinstrs -asm-verbose=0 -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=COMMON %s
+
+; SI hits an assertion at -O0; Evergreen hits a "not implemented" unreachable.
+
+; COMMON-LABEL: {{^}}branch_true:
+define void @branch_true(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 {
+entry:
+ br i1 true, label %for.end, label %for.body.lr.ph
+
+for.body.lr.ph: ; preds = %entry
+ %add.ptr.sum = shl i32 %main_stride, 1
+ %add.ptr1.sum = add i32 %add.ptr.sum, %main_stride
+ %add.ptr4.sum = shl i32 %main_stride, 2
+ br label %for.body
+
+for.body: ; preds = %for.body, %for.body.lr.ph
+ %main.addr.011 = phi i8 addrspace(1)* [ %main, %for.body.lr.ph ], [ %add.ptr6, %for.body ]
+ %0 = bitcast i8 addrspace(1)* %main.addr.011 to i32 addrspace(1)*
+ %1 = load i32, i32 addrspace(1)* %0, align 4
+ %add.ptr = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %main_stride
+ %2 = bitcast i8 addrspace(1)* %add.ptr to i32 addrspace(1)*
+ %3 = load i32, i32 addrspace(1)* %2, align 4
+ %add.ptr1 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %add.ptr.sum
+ %4 = bitcast i8 addrspace(1)* %add.ptr1 to i32 addrspace(1)*
+ %5 = load i32, i32 addrspace(1)* %4, align 4
+ %add.ptr2 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %add.ptr1.sum
+ %6 = bitcast i8 addrspace(1)* %add.ptr2 to i32 addrspace(1)*
+ %7 = load i32, i32 addrspace(1)* %6, align 4
+ %add.ptr3 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %add.ptr4.sum
+ %8 = bitcast i8 addrspace(1)* %add.ptr3 to i32 addrspace(1)*
+ %9 = load i32, i32 addrspace(1)* %8, align 4
+ %add.ptr6 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 undef
+ br i1 undef, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+; COMMON-LABEL: {{^}}branch_false:
+; SI: .text
+; SI-NEXT: s_endpgm
+define void @branch_false(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 {
+entry:
+ br i1 false, label %for.end, label %for.body.lr.ph
+
+for.body.lr.ph: ; preds = %entry
+ %add.ptr.sum = shl i32 %main_stride, 1
+ %add.ptr1.sum = add i32 %add.ptr.sum, %main_stride
+ %add.ptr4.sum = shl i32 %main_stride, 2
+ br label %for.body
+
+for.body: ; preds = %for.body, %for.body.lr.ph
+ %main.addr.011 = phi i8 addrspace(1)* [ %main, %for.body.lr.ph ], [ %add.ptr6, %for.body ]
+ %0 = bitcast i8 addrspace(1)* %main.addr.011 to i32 addrspace(1)*
+ %1 = load i32, i32 addrspace(1)* %0, align 4
+ %add.ptr = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %main_stride
+ %2 = bitcast i8 addrspace(1)* %add.ptr to i32 addrspace(1)*
+ %3 = load i32, i32 addrspace(1)* %2, align 4
+ %add.ptr1 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %add.ptr.sum
+ %4 = bitcast i8 addrspace(1)* %add.ptr1 to i32 addrspace(1)*
+ %5 = load i32, i32 addrspace(1)* %4, align 4
+ %add.ptr2 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %add.ptr1.sum
+ %6 = bitcast i8 addrspace(1)* %add.ptr2 to i32 addrspace(1)*
+ %7 = load i32, i32 addrspace(1)* %6, align 4
+ %add.ptr3 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %add.ptr4.sum
+ %8 = bitcast i8 addrspace(1)* %add.ptr3 to i32 addrspace(1)*
+ %9 = load i32, i32 addrspace(1)* %8, align 4
+ %add.ptr6 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 undef
+ br i1 undef, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+; COMMON-LABEL: {{^}}branch_undef:
+; SI: .text
+; SI-NEXT: s_endpgm
+define void @branch_undef(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 {
+entry:
+ br i1 undef, label %for.end, label %for.body.lr.ph
+
+for.body.lr.ph: ; preds = %entry
+ %add.ptr.sum = shl i32 %main_stride, 1
+ %add.ptr1.sum = add i32 %add.ptr.sum, %main_stride
+ %add.ptr4.sum = shl i32 %main_stride, 2
+ br label %for.body
+
+for.body: ; preds = %for.body, %for.body.lr.ph
+ %main.addr.011 = phi i8 addrspace(1)* [ %main, %for.body.lr.ph ], [ %add.ptr6, %for.body ]
+ %0 = bitcast i8 addrspace(1)* %main.addr.011 to i32 addrspace(1)*
+ %1 = load i32, i32 addrspace(1)* %0, align 4
+ %add.ptr = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %main_stride
+ %2 = bitcast i8 addrspace(1)* %add.ptr to i32 addrspace(1)*
+ %3 = load i32, i32 addrspace(1)* %2, align 4
+ %add.ptr1 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %add.ptr.sum
+ %4 = bitcast i8 addrspace(1)* %add.ptr1 to i32 addrspace(1)*
+ %5 = load i32, i32 addrspace(1)* %4, align 4
+ %add.ptr2 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %add.ptr1.sum
+ %6 = bitcast i8 addrspace(1)* %add.ptr2 to i32 addrspace(1)*
+ %7 = load i32, i32 addrspace(1)* %6, align 4
+ %add.ptr3 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %add.ptr4.sum
+ %8 = bitcast i8 addrspace(1)* %add.ptr3 to i32 addrspace(1)*
+ %9 = load i32, i32 addrspace(1)* %8, align 4
+ %add.ptr6 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 undef
+ br i1 undef, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/unroll.ll b/test/CodeGen/AMDGPU/unroll.ll
new file mode 100644
index 000000000000..411a15a4b839
--- /dev/null
+++ b/test/CodeGen/AMDGPU/unroll.ll
@@ -0,0 +1,36 @@
+; RUN: opt -mtriple=amdgcn-- -loop-unroll -simplifycfg -sroa %s -S -o - | FileCheck %s
+; RUN: opt -mtriple=r600-- -loop-unroll -simplifycfg -sroa %s -S -o - | FileCheck %s
+
+
+; This test contains a simple loop that initializes an array declared in
+; private memory. We want to make sure these kinds of loops are always
+; unrolled, because private memory is slow.
+
+; CHECK-LABEL: @test
+; CHECK-NOT: alloca
+; CHECK: store i32 5, i32 addrspace(1)* %out
+define void @test(i32 addrspace(1)* %out) {
+entry:
+ %0 = alloca [32 x i32]
+ br label %loop.header
+
+loop.header:
+ %counter = phi i32 [0, %entry], [%inc, %loop.inc]
+ br label %loop.body
+
+loop.body:
+ %ptr = getelementptr [32 x i32], [32 x i32]* %0, i32 0, i32 %counter
+ store i32 %counter, i32* %ptr
+ br label %loop.inc
+
+loop.inc:
+ %inc = add i32 %counter, 1
+ %1 = icmp sge i32 %counter, 32
+ br i1 %1, label %exit, label %loop.header
+
+exit:
+ %2 = getelementptr [32 x i32], [32 x i32]* %0, i32 0, i32 5
+ %3 = load i32, i32* %2
+ store i32 %3, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/unsupported-cc.ll b/test/CodeGen/AMDGPU/unsupported-cc.ll
new file mode 100644
index 000000000000..8ab4faf2f145
--- /dev/null
+++ b/test/CodeGen/AMDGPU/unsupported-cc.ll
@@ -0,0 +1,125 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+; These tests are for condition codes that are not supported by the hardware
+
+; CHECK-LABEL: {{^}}slt:
+; CHECK: SETGT_INT {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z
+; CHECK-NEXT: LSHR
+; CHECK-NEXT: 5(7.006492e-45)
+define void @slt(i32 addrspace(1)* %out, i32 %in) {
+entry:
+ %0 = icmp slt i32 %in, 5
+ %1 = select i1 %0, i32 -1, i32 0
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}ult_i32:
+; CHECK: SETGT_UINT {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z
+; CHECK-NEXT: LSHR
+; CHECK-NEXT: 5(7.006492e-45)
+define void @ult_i32(i32 addrspace(1)* %out, i32 %in) {
+entry:
+ %0 = icmp ult i32 %in, 5
+ %1 = select i1 %0, i32 -1, i32 0
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}ult_float:
+; CHECK: SETGE * T{{[0-9]}}.[[CHAN:[XYZW]]], KC0[2].Z, literal.x
+; CHECK-NEXT: 1084227584(5.000000e+00)
+; CHECK-NEXT: SETE T{{[0-9]\.[XYZW]}}, PV.[[CHAN]], 0.0
+; CHECK-NEXT: LSHR *
+define void @ult_float(float addrspace(1)* %out, float %in) {
+entry:
+ %0 = fcmp ult float %in, 5.0
+ %1 = select i1 %0, float 1.0, float 0.0
+ store float %1, float addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}ult_float_native:
+; CHECK: SETGE T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.x
+; CHECK-NEXT: LSHR *
+; CHECK-NEXT: 1084227584(5.000000e+00)
+define void @ult_float_native(float addrspace(1)* %out, float %in) {
+entry:
+ %0 = fcmp ult float %in, 5.0
+ %1 = select i1 %0, float 0.0, float 1.0
+ store float %1, float addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}olt:
+; CHECK: SETGT T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z
+; CHECK-NEXT: LSHR *
+; CHECK-NEXT: 1084227584(5.000000e+00)
+define void @olt(float addrspace(1)* %out, float %in) {
+entry:
+ %0 = fcmp olt float %in, 5.0
+ %1 = select i1 %0, float 1.0, float 0.0
+ store float %1, float addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}sle:
+; CHECK: SETGT_INT {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z
+; CHECK-NEXT: LSHR
+; CHECK-NEXT: 6(8.407791e-45)
+define void @sle(i32 addrspace(1)* %out, i32 %in) {
+entry:
+ %0 = icmp sle i32 %in, 5
+ %1 = select i1 %0, i32 -1, i32 0
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}ule_i32:
+; CHECK: SETGT_UINT {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z
+; CHECK-NEXT: LSHR
+; CHECK-NEXT: 6(8.407791e-45)
+define void @ule_i32(i32 addrspace(1)* %out, i32 %in) {
+entry:
+ %0 = icmp ule i32 %in, 5
+ %1 = select i1 %0, i32 -1, i32 0
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}ule_float:
+; CHECK: SETGT * T{{[0-9]}}.[[CHAN:[XYZW]]], KC0[2].Z, literal.x
+; CHECK-NEXT: 1084227584(5.000000e+00)
+; CHECK-NEXT: SETE T{{[0-9]\.[XYZW]}}, PV.[[CHAN]], 0.0
+; CHECK-NEXT: LSHR *
+define void @ule_float(float addrspace(1)* %out, float %in) {
+entry:
+ %0 = fcmp ule float %in, 5.0
+ %1 = select i1 %0, float 1.0, float 0.0
+ store float %1, float addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}ule_float_native:
+; CHECK: SETGT T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.x
+; CHECK-NEXT: LSHR *
+; CHECK-NEXT: 1084227584(5.000000e+00)
+define void @ule_float_native(float addrspace(1)* %out, float %in) {
+entry:
+ %0 = fcmp ule float %in, 5.0
+ %1 = select i1 %0, float 0.0, float 1.0
+ store float %1, float addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}ole:
+; CHECK: SETGE T{{[0-9]\.[XYZW]}}, literal.x, KC0[2].Z
+; CHECK-NEXT: LSHR *
+; CHECK-NEXT: 1084227584(5.000000e+00)
+define void @ole(float addrspace(1)* %out, float %in) {
+entry:
+ %0 = fcmp ole float %in, 5.0
+ %1 = select i1 %0, float 1.0, float 0.0
+ store float %1, float addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/urecip.ll b/test/CodeGen/AMDGPU/urecip.ll
new file mode 100644
index 000000000000..daacc771708a
--- /dev/null
+++ b/test/CodeGen/AMDGPU/urecip.ll
@@ -0,0 +1,13 @@
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
+
+;CHECK: v_rcp_iflag_f32_e32
+
+define void @test(i32 %p, i32 %q) {
+ %i = udiv i32 %p, %q
+ %r = bitcast i32 %i to float
+ call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %r, float %r, float %r, float %r)
+ ret void
+}
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
diff --git a/test/CodeGen/AMDGPU/urem.ll b/test/CodeGen/AMDGPU/urem.ll
new file mode 100644
index 000000000000..62841ec2d6c5
--- /dev/null
+++ b/test/CodeGen/AMDGPU/urem.ll
@@ -0,0 +1,94 @@
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+; The code generated by urem is long and complex and may frequently
+; change. The goal of this test is to make sure the ISel doesn't fail
+; when it gets a v2i32/v4i32 urem
+
+; FUNC-LABEL: {{^}}test_urem_i32:
+; SI: s_endpgm
+; EG: CF_END
+define void @test_urem_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+ %a = load i32, i32 addrspace(1)* %in
+ %b = load i32, i32 addrspace(1)* %b_ptr
+ %result = urem i32 %a, %b
+ store i32 %result, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_urem_i32_7:
+; SI: v_mov_b32_e32 [[MAGIC:v[0-9]+]], 0x24924925
+; SI: v_mul_hi_u32 {{v[0-9]+}}, [[MAGIC]]
+; SI: v_subrev_i32
+; SI: v_mul_lo_i32
+; SI: v_sub_i32
+; SI: buffer_store_dword
+; SI: s_endpgm
+define void @test_urem_i32_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %num = load i32, i32 addrspace(1) * %in
+ %result = urem i32 %num, 7
+ store i32 %result, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_urem_v2i32:
+; SI: s_endpgm
+; EG: CF_END
+define void @test_urem_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+ %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
+ %a = load <2 x i32>, <2 x i32> addrspace(1)* %in
+ %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr
+ %result = urem <2 x i32> %a, %b
+ store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_urem_v4i32:
+; SI: s_endpgm
+; EG: CF_END
+define void @test_urem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+ %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
+ %a = load <4 x i32>, <4 x i32> addrspace(1)* %in
+ %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr
+ %result = urem <4 x i32> %a, %b
+ store <4 x i32> %result, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_urem_i64:
+; SI: s_endpgm
+; EG: CF_END
+define void @test_urem_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+ %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1
+ %a = load i64, i64 addrspace(1)* %in
+ %b = load i64, i64 addrspace(1)* %b_ptr
+ %result = urem i64 %a, %b
+ store i64 %result, i64 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_urem_v2i64:
+; SI: s_endpgm
+; EG: CF_END
+define void @test_urem_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
+ %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1
+ %a = load <2 x i64>, <2 x i64> addrspace(1)* %in
+ %b = load <2 x i64>, <2 x i64> addrspace(1)* %b_ptr
+ %result = urem <2 x i64> %a, %b
+ store <2 x i64> %result, <2 x i64> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}test_urem_v4i64:
+; SI: s_endpgm
+; EG: CF_END
+define void @test_urem_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
+ %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1
+ %a = load <4 x i64>, <4 x i64> addrspace(1)* %in
+ %b = load <4 x i64>, <4 x i64> addrspace(1)* %b_ptr
+ %result = urem <4 x i64> %a, %b
+ store <4 x i64> %result, <4 x i64> addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll b/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
new file mode 100644
index 000000000000..f26f30022b4f
--- /dev/null
+++ b/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
@@ -0,0 +1,103 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
+
+declare float @llvm.fma.f32(float, float, float) #1
+declare float @llvm.fmuladd.f32(float, float, float) #1
+declare i32 @llvm.AMDGPU.imad24(i32, i32, i32) #1
+
+
+; GCN-LABEL: {{^}}test_sgpr_use_twice_binop:
+; GCN: s_load_dword [[SGPR:s[0-9]+]],
+; GCN: v_add_f32_e64 [[RESULT:v[0-9]+]], [[SGPR]], [[SGPR]]
+; GCN: buffer_store_dword [[RESULT]]
+define void @test_sgpr_use_twice_binop(float addrspace(1)* %out, float %a) #0 {
+ %dbl = fadd float %a, %a
+ store float %dbl, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_sgpr_use_three_ternary_op:
+; GCN: s_load_dword [[SGPR:s[0-9]+]],
+; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], [[SGPR]], [[SGPR]]
+; GCN: buffer_store_dword [[RESULT]]
+define void @test_sgpr_use_three_ternary_op(float addrspace(1)* %out, float %a) #0 {
+ %fma = call float @llvm.fma.f32(float %a, float %a, float %a) #1
+ store float %fma, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_a_b:
+; SI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
+; VI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; VI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
+; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]]
+; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR0]], [[SGPR0]], [[VGPR1]]
+; GCN: buffer_store_dword [[RESULT]]
+define void @test_sgpr_use_twice_ternary_op_a_a_b(float addrspace(1)* %out, float %a, float %b) #0 {
+ %fma = call float @llvm.fma.f32(float %a, float %a, float %b) #1
+ store float %fma, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_b_a:
+; SI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
+; VI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; VI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
+; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]]
+; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[VGPR1]], [[SGPR0]], [[SGPR0]]
+; GCN: buffer_store_dword [[RESULT]]
+define void @test_sgpr_use_twice_ternary_op_a_b_a(float addrspace(1)* %out, float %a, float %b) #0 {
+ %fma = call float @llvm.fma.f32(float %a, float %b, float %a) #1
+ store float %fma, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_b_a_a:
+; SI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
+; VI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; VI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
+; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]]
+; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR0]], [[VGPR1]], [[SGPR0]]
+; GCN: buffer_store_dword [[RESULT]]
+define void @test_sgpr_use_twice_ternary_op_b_a_a(float addrspace(1)* %out, float %a, float %b) #0 {
+ %fma = call float @llvm.fma.f32(float %b, float %a, float %a) #1
+ store float %fma, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_a_imm:
+; GCN: s_load_dword [[SGPR:s[0-9]+]]
+; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], [[SGPR]], 2.0
+; GCN: buffer_store_dword [[RESULT]]
+define void @test_sgpr_use_twice_ternary_op_a_a_imm(float addrspace(1)* %out, float %a) #0 {
+ %fma = call float @llvm.fma.f32(float %a, float %a, float 2.0) #1
+ store float %fma, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_imm_a:
+; GCN: s_load_dword [[SGPR:s[0-9]+]]
+; GCN: v_fma_f32 [[RESULT:v[0-9]+]], 2.0, [[SGPR]], [[SGPR]]
+; GCN: buffer_store_dword [[RESULT]]
+define void @test_sgpr_use_twice_ternary_op_a_imm_a(float addrspace(1)* %out, float %a) #0 {
+ %fma = call float @llvm.fma.f32(float %a, float 2.0, float %a) #1
+ store float %fma, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; Don't use fma since fma c, x, y is canonicalized to fma x, c, y
+; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_imm_a_a:
+; GCN: s_load_dword [[SGPR:s[0-9]+]]
+; GCN: v_mad_i32_i24 [[RESULT:v[0-9]+]], 2, [[SGPR]], [[SGPR]]
+; GCN: buffer_store_dword [[RESULT]]
+define void @test_sgpr_use_twice_ternary_op_imm_a_a(i32 addrspace(1)* %out, i32 %a) #0 {
+ %fma = call i32 @llvm.AMDGPU.imad24(i32 2, i32 %a, i32 %a) #1
+ store i32 %fma, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/usubo.ll b/test/CodeGen/AMDGPU/usubo.ll
new file mode 100644
index 000000000000..3c9b1622a076
--- /dev/null
+++ b/test/CodeGen/AMDGPU/usubo.ll
@@ -0,0 +1,86 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+declare { i32, i1 } @llvm.usub.with.overflow.i32(i32, i32) nounwind readnone
+declare { i64, i1 } @llvm.usub.with.overflow.i64(i64, i64) nounwind readnone
+
+; FUNC-LABEL: {{^}}usubo_i64_zext:
+
+; EG: SUBB_UINT
+; EG: ADDC_UINT
+define void @usubo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+ %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b) nounwind
+ %val = extractvalue { i64, i1 } %usub, 0
+ %carry = extractvalue { i64, i1 } %usub, 1
+ %ext = zext i1 %carry to i64
+ %add2 = add i64 %val, %ext
+ store i64 %add2, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_usubo_i32:
+; SI: s_sub_i32
+
+; EG-DAG: SUBB_UINT
+; EG-DAG: SUB_INT
+define void @s_usubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind {
+ %usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b) nounwind
+ %val = extractvalue { i32, i1 } %usub, 0
+ %carry = extractvalue { i32, i1 } %usub, 1
+ store i32 %val, i32 addrspace(1)* %out, align 4
+ store i1 %carry, i1 addrspace(1)* %carryout
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_usubo_i32:
+; SI: v_subrev_i32_e32
+
+; EG-DAG: SUBB_UINT
+; EG-DAG: SUB_INT
+define void @v_usubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
+ %a = load i32, i32 addrspace(1)* %aptr, align 4
+ %b = load i32, i32 addrspace(1)* %bptr, align 4
+ %usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b) nounwind
+ %val = extractvalue { i32, i1 } %usub, 0
+ %carry = extractvalue { i32, i1 } %usub, 1
+ store i32 %val, i32 addrspace(1)* %out, align 4
+ store i1 %carry, i1 addrspace(1)* %carryout
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_usubo_i64:
+; SI: s_sub_u32
+; SI: s_subb_u32
+
+; EG-DAG: SUBB_UINT
+; EG-DAG: SUB_INT
+; EG-DAG: SUB_INT
+; EG: SUB_INT
+define void @s_usubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind {
+ %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b) nounwind
+ %val = extractvalue { i64, i1 } %usub, 0
+ %carry = extractvalue { i64, i1 } %usub, 1
+ store i64 %val, i64 addrspace(1)* %out, align 8
+ store i1 %carry, i1 addrspace(1)* %carryout
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_usubo_i64:
+; SI: v_sub_i32
+; SI: v_subb_u32
+
+; EG-DAG: SUBB_UINT
+; EG-DAG: SUB_INT
+; EG-DAG: SUB_INT
+; EG: SUB_INT
+define void @v_usubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
+ %a = load i64, i64 addrspace(1)* %aptr, align 4
+ %b = load i64, i64 addrspace(1)* %bptr, align 4
+ %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b) nounwind
+ %val = extractvalue { i64, i1 } %usub, 0
+ %carry = extractvalue { i64, i1 } %usub, 1
+ store i64 %val, i64 addrspace(1)* %out, align 8
+ store i1 %carry, i1 addrspace(1)* %carryout
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/v1i64-kernel-arg.ll b/test/CodeGen/AMDGPU/v1i64-kernel-arg.ll
new file mode 100644
index 000000000000..31755125c03b
--- /dev/null
+++ b/test/CodeGen/AMDGPU/v1i64-kernel-arg.ll
@@ -0,0 +1,17 @@
+; REQUIRES: asserts
+; XFAIL: *
+; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck %s
+
+; CHECK-LABEL: {{^}}kernel_arg_i64:
+define void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind {
+ store i64 %a, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; i64 arg works, v1i64 arg does not.
+; CHECK-LABEL: {{^}}kernel_arg_v1i64:
+define void @kernel_arg_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a) nounwind {
+ store <1 x i64> %a, <1 x i64> addrspace(1)* %out, align 8
+ ret void
+}
+
diff --git a/test/CodeGen/AMDGPU/v_cndmask.ll b/test/CodeGen/AMDGPU/v_cndmask.ll
new file mode 100644
index 000000000000..c368c5aaf7dc
--- /dev/null
+++ b/test/CodeGen/AMDGPU/v_cndmask.ll
@@ -0,0 +1,39 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+declare i32 @llvm.r600.read.tidig.x() #1
+
+; SI-LABEL: {{^}}v_cnd_nan_nosgpr:
+; SI: v_cndmask_b32_e64 v{{[0-9]}}, v{{[0-9]}}, -1, s{{\[[0-9]+:[0-9]+\]}}
+; SI-DAG: v{{[0-9]}}
+; All nan values are converted to 0xffffffff
+; SI: s_endpgm
+define void @v_cnd_nan_nosgpr(float addrspace(1)* %out, i32 %c, float addrspace(1)* %fptr) #0 {
+ %idx = call i32 @llvm.r600.read.tidig.x() #1
+ %f.gep = getelementptr float, float addrspace(1)* %fptr, i32 %idx
+ %f = load float, float addrspace(1)* %f.gep
+ %setcc = icmp ne i32 %c, 0
+ %select = select i1 %setcc, float 0xFFFFFFFFE0000000, float %f
+ store float %select, float addrspace(1)* %out
+ ret void
+}
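+
+; A minimal illustrative sketch (the function @nan_bits_sketch is hypothetical
+; and not covered by any check above or below): the float literal
+; 0xFFFFFFFFE0000000 is the double-hex spelling of a NaN whose single-precision
+; bit pattern is 0xffffffff, so the same select can also be written through a
+; bitcast of -1, matching the inline constant -1 in the v_cndmask checks.
+define void @nan_bits_sketch(float addrspace(1)* %out, i32 %c, float %f) #0 {
+ %nan = bitcast i32 -1 to float
+ %setcc = icmp ne i32 %c, 0
+ %select = select i1 %setcc, float %nan, float %f
+ store float %select, float addrspace(1)* %out
+ ret void
+}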
+
+
+; This requires slightly trickier SGPR operand legalization since the
+; single constant bus SGPR usage is the last operand, and it should
+; never be moved.
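+; (On SI a VALU instruction may read at most one SGPR or literal through the
+; constant bus; -1 is an inline constant, so the single constant bus read in
+; the check below is the condition mask in the final operand.)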
+
+; SI-LABEL: {{^}}v_cnd_nan:
+; SI: v_cndmask_b32_e64 v{{[0-9]}}, v{{[0-9]}}, -1, s{{\[[0-9]+:[0-9]+\]}}
+; SI-DAG: v{{[0-9]}}
+; All nan values are converted to 0xffffffff
+; SI: s_endpgm
+define void @v_cnd_nan(float addrspace(1)* %out, i32 %c, float %f) #0 {
+ %setcc = icmp ne i32 %c, 0
+ %select = select i1 %setcc, float 0xFFFFFFFFE0000000, float %f
+ store float %select, float addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/valu-i1.ll b/test/CodeGen/AMDGPU/valu-i1.ll
new file mode 100644
index 000000000000..7d0ebd139f51
--- /dev/null
+++ b/test/CodeGen/AMDGPU/valu-i1.ll
@@ -0,0 +1,188 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -enable-misched -asm-verbose < %s | FileCheck -check-prefix=SI %s
+
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+
+; SI-LABEL: @test_if
+; Make sure the i1 values created by the cfg structurizer pass are
+; moved using VALU instructions
+; SI-NOT: s_mov_b64 s[{{[0-9]:[0-9]}}], -1
+; SI: v_mov_b32_e32 v{{[0-9]}}, -1
+define void @test_if(i32 %a, i32 %b, i32 addrspace(1)* %src, i32 addrspace(1)* %dst) #1 {
+entry:
+ switch i32 %a, label %default [
+ i32 0, label %case0
+ i32 1, label %case1
+ ]
+
+case0:
+ %arrayidx1 = getelementptr i32, i32 addrspace(1)* %dst, i32 %b
+ store i32 0, i32 addrspace(1)* %arrayidx1, align 4
+ br label %end
+
+case1:
+ %arrayidx5 = getelementptr i32, i32 addrspace(1)* %dst, i32 %b
+ store i32 1, i32 addrspace(1)* %arrayidx5, align 4
+ br label %end
+
+default:
+ %cmp8 = icmp eq i32 %a, 2
+ %arrayidx10 = getelementptr i32, i32 addrspace(1)* %dst, i32 %b
+ br i1 %cmp8, label %if, label %else
+
+if:
+ store i32 2, i32 addrspace(1)* %arrayidx10, align 4
+ br label %end
+
+else:
+ store i32 3, i32 addrspace(1)* %arrayidx10, align 4
+ br label %end
+
+end:
+ ret void
+}
+
+; SI-LABEL: @simple_test_v_if
+; SI: v_cmp_ne_i32_e32 vcc, 0, v{{[0-9]+}}
+; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
+; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]]
+
+; SI: ; BB#1
+; SI: buffer_store_dword
+; SI: s_endpgm
+
+; SI: BB1_2:
+; SI: s_or_b64 exec, exec, [[BR_SREG]]
+; SI: s_endpgm
+define void @simple_test_v_if(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
+ %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %is.0 = icmp ne i32 %tid, 0
+ br i1 %is.0, label %store, label %exit
+
+store:
+ %gep = getelementptr i32, i32 addrspace(1)* %dst, i32 %tid
+ store i32 999, i32 addrspace(1)* %gep
+ ret void
+
+exit:
+ ret void
+}
+
+; SI-LABEL: @simple_test_v_loop
+; SI: v_cmp_ne_i32_e32 vcc, 0, v{{[0-9]+}}
+; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
+; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]]
+; SI: s_cbranch_execz BB2_2
+
+; SI: ; BB#1:
+; SI: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, 0{{$}}
+
+; SI: BB2_3:
+; SI: buffer_load_dword
+; SI: buffer_store_dword
+; SI: v_cmp_eq_i32_e32 vcc,
+; SI: s_or_b64 [[OR_SREG:s\[[0-9]+:[0-9]+\]]]
+; SI: s_andn2_b64 exec, exec, [[OR_SREG]]
+; SI: s_cbranch_execnz BB2_3
+
+define void @simple_test_v_loop(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
+entry:
+ %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %is.0 = icmp ne i32 %tid, 0
+ %limit = add i32 %tid, 64
+ br i1 %is.0, label %loop, label %exit
+
+loop:
+ %i = phi i32 [%tid, %entry], [%i.inc, %loop]
+ %gep.src = getelementptr i32, i32 addrspace(1)* %src, i32 %i
+ %gep.dst = getelementptr i32, i32 addrspace(1)* %dst, i32 %i
+ %load = load i32, i32 addrspace(1)* %gep.src
+ store i32 %load, i32 addrspace(1)* %gep.dst
+ %i.inc = add nsw i32 %i, 1
+ %cmp = icmp eq i32 %limit, %i.inc
+ br i1 %cmp, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+; SI-LABEL: @multi_vcond_loop
+
+; Load loop limit from buffer
+; Branch to exit if uniformly not taken
+; SI: ; BB#0:
+; SI: buffer_load_dword [[VBOUND:v[0-9]+]]
+; SI: v_cmp_lt_i32_e32 vcc
+; SI: s_and_saveexec_b64 [[OUTER_CMP_SREG:s\[[0-9]+:[0-9]+\]]], vcc
+; SI: s_xor_b64 [[OUTER_CMP_SREG]], exec, [[OUTER_CMP_SREG]]
+; SI: s_cbranch_execz BB3_2
+
+; Initialize inner condition to false
+; SI: ; BB#1:
+; SI: s_mov_b64 [[ZERO:s\[[0-9]+:[0-9]+\]]], 0{{$}}
+; SI: s_mov_b64 [[COND_STATE:s\[[0-9]+:[0-9]+\]]], [[ZERO]]
+
+; Clear exec bits for workitems that load -1s
+; SI: BB3_3:
+; SI: buffer_load_dword [[B:v[0-9]+]]
+; SI: buffer_load_dword [[A:v[0-9]+]]
+; SI-DAG: v_cmp_ne_i32_e64 [[NEG1_CHECK_0:s\[[0-9]+:[0-9]+\]]], -1, [[A]]
+; SI-DAG: v_cmp_ne_i32_e32 [[NEG1_CHECK_1:vcc]], -1, [[B]]
+; SI: s_and_b64 [[ORNEG1:s\[[0-9]+:[0-9]+\]]], [[NEG1_CHECK_1]], [[NEG1_CHECK_0]]
+; SI: s_and_saveexec_b64 [[ORNEG1]], [[ORNEG1]]
+; SI: s_xor_b64 [[ORNEG1]], exec, [[ORNEG1]]
+; SI: s_cbranch_execz BB3_5
+
+; SI: BB#4:
+; SI: buffer_store_dword
+; SI: v_cmp_ge_i64_e32 vcc
+; SI: s_or_b64 [[COND_STATE]], vcc, [[COND_STATE]]
+
+; SI: BB3_5:
+; SI: s_or_b64 exec, exec, [[ORNEG1]]
+; SI: s_or_b64 [[COND_STATE]], [[ORNEG1]], [[COND_STATE]]
+; SI: s_andn2_b64 exec, exec, [[COND_STATE]]
+; SI: s_cbranch_execnz BB3_3
+
+; SI: BB#6
+; SI: s_or_b64 exec, exec, [[COND_STATE]]
+
+; SI: BB3_2:
+; SI-NOT: [[COND_STATE]]
+; SI: s_endpgm
+
+define void @multi_vcond_loop(i32 addrspace(1)* noalias nocapture %arg, i32 addrspace(1)* noalias nocapture readonly %arg1, i32 addrspace(1)* noalias nocapture readonly %arg2, i32 addrspace(1)* noalias nocapture readonly %arg3) #1 {
+bb:
+ %tmp = tail call i32 @llvm.r600.read.tidig.x() #0
+ %tmp4 = sext i32 %tmp to i64
+ %tmp5 = getelementptr inbounds i32, i32 addrspace(1)* %arg3, i64 %tmp4
+ %tmp6 = load i32, i32 addrspace(1)* %tmp5, align 4
+ %tmp7 = icmp sgt i32 %tmp6, 0
+ %tmp8 = sext i32 %tmp6 to i64
+ br i1 %tmp7, label %bb10, label %bb26
+
+bb10: ; preds = %bb, %bb20
+ %tmp11 = phi i64 [ %tmp23, %bb20 ], [ 0, %bb ]
+ %tmp12 = add nsw i64 %tmp11, %tmp4
+ %tmp13 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp12
+ %tmp14 = load i32, i32 addrspace(1)* %tmp13, align 4
+ %tmp15 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp12
+ %tmp16 = load i32, i32 addrspace(1)* %tmp15, align 4
+ %tmp17 = icmp ne i32 %tmp14, -1
+ %tmp18 = icmp ne i32 %tmp16, -1
+ %tmp19 = and i1 %tmp17, %tmp18
+ br i1 %tmp19, label %bb20, label %bb26
+
+bb20: ; preds = %bb10
+ %tmp21 = add nsw i32 %tmp16, %tmp14
+ %tmp22 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp12
+ store i32 %tmp21, i32 addrspace(1)* %tmp22, align 4
+ %tmp23 = add nuw nsw i64 %tmp11, 1
+ %tmp24 = icmp slt i64 %tmp23, %tmp8
+ br i1 %tmp24, label %bb10, label %bb26
+
+bb26: ; preds = %bb10, %bb20, %bb
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/vector-alloca.ll b/test/CodeGen/AMDGPU/vector-alloca.ll
new file mode 100644
index 000000000000..6f3b4847fbdf
--- /dev/null
+++ b/test/CodeGen/AMDGPU/vector-alloca.ll
@@ -0,0 +1,77 @@
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=verde -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=verde -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}vector_read:
+; EG: MOV
+; EG: MOV
+; EG: MOV
+; EG: MOV
+; EG: MOVA_INT
+define void @vector_read(i32 addrspace(1)* %out, i32 %index) {
+entry:
+ %0 = alloca [4 x i32]
+ %x = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 0
+ %y = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 1
+ %z = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 2
+ %w = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 3
+ store i32 0, i32* %x
+ store i32 1, i32* %y
+ store i32 2, i32* %z
+ store i32 3, i32* %w
+ %1 = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 %index
+ %2 = load i32, i32* %1
+ store i32 %2, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}vector_write:
+; EG: MOV
+; EG: MOV
+; EG: MOV
+; EG: MOV
+; EG: MOVA_INT
+; EG: MOVA_INT
+define void @vector_write(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) {
+entry:
+ %0 = alloca [4 x i32]
+ %x = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 0
+ %y = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 1
+ %z = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 2
+ %w = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 3
+ store i32 0, i32* %x
+ store i32 0, i32* %y
+ store i32 0, i32* %z
+ store i32 0, i32* %w
+ %1 = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 %w_index
+ store i32 1, i32* %1
+ %2 = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 %r_index
+ %3 = load i32, i32* %2
+ store i32 %3, i32 addrspace(1)* %out
+ ret void
+}
+
+; This test should be optimized to:
+; store i32 0, i32 addrspace(1)* %out
+; (the GEP through the bitcast addresses element 1 of the alloca, which still
+; holds the 0 stored above).
+; FUNC-LABEL: {{^}}bitcast_gep:
+; EG: STORE_RAW
+define void @bitcast_gep(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) {
+entry:
+ %0 = alloca [4 x i32]
+ %x = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 0
+ %y = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 1
+ %z = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 2
+ %w = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 3
+ store i32 0, i32* %x
+ store i32 0, i32* %y
+ store i32 0, i32* %z
+ store i32 0, i32* %w
+ %1 = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 1
+ %2 = bitcast i32* %1 to [4 x i32]*
+ %3 = getelementptr [4 x i32], [4 x i32]* %2, i32 0, i32 0
+ %4 = load i32, i32* %3
+ store i32 %4, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/vertex-fetch-encoding.ll b/test/CodeGen/AMDGPU/vertex-fetch-encoding.ll
new file mode 100644
index 000000000000..fb6a17e67146
--- /dev/null
+++ b/test/CodeGen/AMDGPU/vertex-fetch-encoding.ll
@@ -0,0 +1,25 @@
+; RUN: llc < %s -march=r600 -show-mc-encoding -mcpu=barts | FileCheck --check-prefix=NI %s
+; RUN: llc < %s -march=r600 -show-mc-encoding -mcpu=cayman | FileCheck --check-prefix=CM %s
+
+; NI: {{^}}vtx_fetch32:
+; NI: VTX_READ_32 T[[GPR:[0-9]]].X, T[[GPR]].X, 0 ; encoding: [0x40,0x01,0x0[[GPR]],0x10,0x0[[GPR]],0xf0,0x5f,0x13,0x00,0x00,0x08,0x00
+; CM: {{^}}vtx_fetch32:
+; CM: VTX_READ_32 T[[GPR:[0-9]]].X, T[[GPR]].X, 0 ; encoding: [0x40,0x01,0x0[[GPR]],0x00,0x0[[GPR]],0xf0,0x5f,0x13,0x00,0x00,0x00,0x00
+
+define void @vtx_fetch32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+entry:
+ %0 = load i32, i32 addrspace(1)* %in
+ store i32 %0, i32 addrspace(1)* %out
+ ret void
+}
+
+; NI: {{^}}vtx_fetch128:
+; NI: VTX_READ_128 T[[DST:[0-9]]].XYZW, T[[SRC:[0-9]]].X, 0 ; encoding: [0x40,0x01,0x0[[SRC]],0x40,0x0[[DST]],0x10,0x8d,0x18,0x00,0x00,0x08,0x00
+; XXX: Add a case for Cayman when v4i32 stores are supported.
+
+define void @vtx_fetch128(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+entry:
+ %0 = load <4 x i32>, <4 x i32> addrspace(1)* %in
+ store <4 x i32> %0, <4 x i32> addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/vop-shrink.ll b/test/CodeGen/AMDGPU/vop-shrink.ll
new file mode 100644
index 000000000000..9b2f229c05af
--- /dev/null
+++ b/test/CodeGen/AMDGPU/vop-shrink.ll
@@ -0,0 +1,51 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+; Test that we correctly commute a sub instruction
+; FUNC-LABEL: {{^}}sub_rev:
+; SI-NOT: v_sub_i32_e32 v{{[0-9]+}}, s
+; SI: v_subrev_i32_e32 v{{[0-9]+}}, s
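+; Background note (describes the VOP2 encoding constraint, not a check): only
+; src0 of the 32-bit encoding may read an SGPR, so "sub vgpr, sgpr" is
+; commuted to v_subrev_i32 (which computes src1 - src0) to keep the SGPR
+; operand in src0.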
+
+; ModuleID = 'vop-shrink.ll'
+
+define void @sub_rev(i32 addrspace(1)* %out, <4 x i32> %sgpr, i32 %cond) {
+entry:
+ %vgpr = call i32 @llvm.r600.read.tidig.x() #1
+ %tmp = icmp eq i32 %cond, 0
+ br i1 %tmp, label %if, label %else
+
+if: ; preds = %entry
+ %tmp1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
+ %tmp2 = extractelement <4 x i32> %sgpr, i32 1
+ store i32 %tmp2, i32 addrspace(1)* %out
+ br label %endif
+
+else: ; preds = %entry
+ %tmp3 = extractelement <4 x i32> %sgpr, i32 2
+ %tmp4 = sub i32 %vgpr, %tmp3
+ store i32 %tmp4, i32 addrspace(1)* %out
+ br label %endif
+
+endif: ; preds = %else, %if
+ ret void
+}
+
+; Test that we fold an immediate that was illegal for a 64-bit op into the
+; 32-bit op when we shrink it.
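+; For reference: 1.024000e+03 is 1024.0, whose single-precision encoding is
+; 0x44800000. That value is not a hardware inline constant, so it can only be
+; encoded as a literal once the instruction is shrunk to the 32-bit form,
+; which is what the check below expects.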
+
+; FUNC-LABEL: {{^}}add_fold:
+; SI: v_add_f32_e32 v{{[0-9]+}}, 0x44800000
+define void @add_fold(float addrspace(1)* %out) {
+entry:
+ %tmp = call i32 @llvm.r600.read.tidig.x()
+ %tmp1 = uitofp i32 %tmp to float
+ %tmp2 = fadd float %tmp1, 1.024000e+03
+ store float %tmp2, float addrspace(1)* %out
+ ret void
+}
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tidig.x() #0
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { readnone }
diff --git a/test/CodeGen/AMDGPU/vselect.ll b/test/CodeGen/AMDGPU/vselect.ll
new file mode 100644
index 000000000000..a3014b03d2b3
--- /dev/null
+++ b/test/CodeGen/AMDGPU/vselect.ll
@@ -0,0 +1,77 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s
+;RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI %s
+
+;EG: {{^}}test_select_v2i32:
+;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+;SI: {{^}}test_select_v2i32:
+;SI: v_cndmask_b32_e64
+;SI: v_cndmask_b32_e64
+
+define void @test_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in0, <2 x i32> addrspace(1)* %in1) {
+entry:
+ %0 = load <2 x i32>, <2 x i32> addrspace(1)* %in0
+ %1 = load <2 x i32>, <2 x i32> addrspace(1)* %in1
+ %cmp = icmp ne <2 x i32> %0, %1
+ %result = select <2 x i1> %cmp, <2 x i32> %0, <2 x i32> %1
+ store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+;EG: {{^}}test_select_v2f32:
+;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+;SI: {{^}}test_select_v2f32:
+;SI: v_cndmask_b32_e64
+;SI: v_cndmask_b32_e64
+
+define void @test_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in0, <2 x float> addrspace(1)* %in1) {
+entry:
+ %0 = load <2 x float>, <2 x float> addrspace(1)* %in0
+ %1 = load <2 x float>, <2 x float> addrspace(1)* %in1
+ %cmp = fcmp une <2 x float> %0, %1
+ %result = select <2 x i1> %cmp, <2 x float> %0, <2 x float> %1
+ store <2 x float> %result, <2 x float> addrspace(1)* %out
+ ret void
+}
+
+;EG: {{^}}test_select_v4i32:
+;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+;SI: {{^}}test_select_v4i32:
+;SI: v_cndmask_b32_e64
+;SI: v_cndmask_b32_e64
+;SI: v_cndmask_b32_e64
+;SI: v_cndmask_b32_e64
+
+define void @test_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1) {
+entry:
+ %0 = load <4 x i32>, <4 x i32> addrspace(1)* %in0
+ %1 = load <4 x i32>, <4 x i32> addrspace(1)* %in1
+ %cmp = icmp ne <4 x i32> %0, %1
+ %result = select <4 x i1> %cmp, <4 x i32> %0, <4 x i32> %1
+ store <4 x i32> %result, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+;EG: {{^}}test_select_v4f32:
+;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @test_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in0, <4 x float> addrspace(1)* %in1) {
+entry:
+ %0 = load <4 x float>, <4 x float> addrspace(1)* %in0
+ %1 = load <4 x float>, <4 x float> addrspace(1)* %in1
+ %cmp = fcmp une <4 x float> %0, %1
+ %result = select <4 x i1> %cmp, <4 x float> %0, <4 x float> %1
+ store <4 x float> %result, <4 x float> addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/vselect64.ll b/test/CodeGen/AMDGPU/vselect64.ll
new file mode 100644
index 000000000000..ef85ebe7899f
--- /dev/null
+++ b/test/CodeGen/AMDGPU/vselect64.ll
@@ -0,0 +1,15 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+; XXX: Merge this test into vselect.ll once SI supports 64-bit select.
+
+; CHECK-LABEL: {{^}}test_select_v4i64:
+; Make sure the vectors aren't being stored on the stack. We know they are
+; being stored on the stack if the shader uses at least 10 registers, which
+; the CHECK-NOT below rejects as any two-digit T register.
+; CHECK-NOT: {{\**}} MOV T{{[0-9][0-9]}}.X
+define void @test_select_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> %c) {
+entry:
+ %cmp = icmp ne <4 x i32> %c, <i32 0, i32 0, i32 0, i32 0>
+ %result = select <4 x i1> %cmp, <4 x i64> <i64 0, i64 1, i64 2, i64 3>, <4 x i64> <i64 4, i64 5, i64 6, i64 7>
+ store <4 x i64> %result, <4 x i64> addrspace(1)* %out
+ ret void
+}
+
diff --git a/test/CodeGen/AMDGPU/vtx-fetch-branch.ll b/test/CodeGen/AMDGPU/vtx-fetch-branch.ll
new file mode 100644
index 000000000000..4584d6e25254
--- /dev/null
+++ b/test/CodeGen/AMDGPU/vtx-fetch-branch.ll
@@ -0,0 +1,29 @@
+; RUN: llc -march=r600 -mcpu=redwood %s -o - | FileCheck %s
+
+; This tests for a bug where vertex fetch clauses right before an ENDIF
+; instruction were being emitted after the ENDIF. We were using ALU_POP_AFTER
+; for the ALU clause before the vertex fetch instead of emitting a POP
+; instruction after the fetch clause.
+
+
+; CHECK-LABEL: {{^}}test:
+; CHECK-NOT: ALU_POP_AFTER
+; CHECK: TEX
+; CHECK-NEXT: POP
+define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %cond) {
+entry:
+ %0 = icmp eq i32 %cond, 0
+ br i1 %0, label %endif, label %if
+
+if:
+ %1 = load i32, i32 addrspace(1)* %in
+ br label %endif
+
+endif:
+ %x = phi i32 [ %1, %if], [ 0, %entry]
+ store i32 %x, i32 addrspace(1)* %out
+ br label %done
+
+done:
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/vtx-schedule.ll b/test/CodeGen/AMDGPU/vtx-schedule.ll
new file mode 100644
index 000000000000..912e258ebb83
--- /dev/null
+++ b/test/CodeGen/AMDGPU/vtx-schedule.ll
@@ -0,0 +1,18 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+; This test is for a scheduler bug where VTX_READ instructions that used
+; the result of another VTX_READ instruction were being grouped in the
+; same fetch clause.
+
+; CHECK: {{^}}test:
+; CHECK: Fetch clause
+; CHECK: VTX_READ_32 [[IN0:T[0-9]+\.X]], [[IN0]], 0
+; CHECK: Fetch clause
+; CHECK: VTX_READ_32 [[IN1:T[0-9]+\.X]], [[IN1]], 0
+define void @test(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* addrspace(1)* nocapture %in0) {
+entry:
+ %0 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %in0
+ %1 = load i32, i32 addrspace(1)* %0
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/wait.ll b/test/CodeGen/AMDGPU/wait.ll
new file mode 100644
index 000000000000..5cc7577cad33
--- /dev/null
+++ b/test/CodeGen/AMDGPU/wait.ll
@@ -0,0 +1,45 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -strict-whitespace %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace %s
+
+; CHECK-LABEL: {{^}}main:
+; CHECK: s_load_dwordx4
+; CHECK: s_load_dwordx4
+; CHECK: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; CHECK: s_endpgm
+define void @main(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, <16 x i8> addrspace(2)* inreg %arg3, <16 x i8> addrspace(2)* inreg %arg4, i32 inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9, float addrspace(2)* inreg %constptr) #0 {
+main_body:
+ %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg3, i32 0
+ %tmp10 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0
+ %tmp11 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %tmp10, i32 0, i32 %arg6)
+ %tmp12 = extractelement <4 x float> %tmp11, i32 0
+ %tmp13 = extractelement <4 x float> %tmp11, i32 1
+ call void @llvm.AMDGPU.barrier.global() #1
+ %tmp14 = extractelement <4 x float> %tmp11, i32 2
+; %tmp15 = extractelement <4 x float> %tmp11, i32 3
+ %tmp15 = load float, float addrspace(2)* %constptr, align 4 ; Force waiting for expcnt and lgkmcnt
+ %tmp16 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg3, i32 1
+ %tmp17 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp16, !tbaa !0
+ %tmp18 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %tmp17, i32 0, i32 %arg6)
+ %tmp19 = extractelement <4 x float> %tmp18, i32 0
+ %tmp20 = extractelement <4 x float> %tmp18, i32 1
+ %tmp21 = extractelement <4 x float> %tmp18, i32 2
+ %tmp22 = extractelement <4 x float> %tmp18, i32 3
+ call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 32, i32 0, float %tmp19, float %tmp20, float %tmp21, float %tmp22)
+ call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %tmp12, float %tmp13, float %tmp14, float %tmp15)
+ ret void
+}
+
+; Function Attrs: noduplicate nounwind
+declare void @llvm.AMDGPU.barrier.global() #1
+
+; Function Attrs: nounwind readnone
+declare <4 x float> @llvm.SI.vs.load.input(<16 x i8>, i32, i32) #2
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+
+attributes #0 = { "ShaderType"="1" }
+attributes #1 = { noduplicate nounwind }
+attributes #2 = { nounwind readnone }
+
+!0 = !{!1, !1, i64 0, i32 1}
+!1 = !{!"const", null}
diff --git a/test/CodeGen/AMDGPU/work-item-intrinsics.ll b/test/CodeGen/AMDGPU/work-item-intrinsics.ll
new file mode 100644
index 000000000000..4328e964c1bf
--- /dev/null
+++ b/test/CodeGen/AMDGPU/work-item-intrinsics.ll
@@ -0,0 +1,238 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+
+; FUNC-LABEL: {{^}}ngroups_x:
+; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; EG: MOV [[VAL]], KC0[0].X
+
+; GCN: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0
+; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN: buffer_store_dword [[VVAL]]
+define void @ngroups_x (i32 addrspace(1)* %out) {
+entry:
+ %0 = call i32 @llvm.r600.read.ngroups.x() #0
+ store i32 %0, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}ngroups_y:
+; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; EG: MOV [[VAL]], KC0[0].Y
+
+; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1
+; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4
+; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN: buffer_store_dword [[VVAL]]
+define void @ngroups_y (i32 addrspace(1)* %out) {
+entry:
+ %0 = call i32 @llvm.r600.read.ngroups.y() #0
+ store i32 %0, i32 addrspace(1)* %out
+ ret void
+}
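+
+; Note on the offsets above and below (background, not a check): the SI and VI
+; lines differ only because the SI SMRD encoding expresses the s_load_dword
+; offset in dwords while the VI SMEM encoding uses bytes, so each VI offset is
+; the corresponding SI offset multiplied by four (0x1 vs 0x4, 0x2 vs 0x8, ...).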
+
+; FUNC-LABEL: {{^}}ngroups_z:
+; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; EG: MOV [[VAL]], KC0[0].Z
+
+; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2
+; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8
+; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN: buffer_store_dword [[VVAL]]
+define void @ngroups_z (i32 addrspace(1)* %out) {
+entry:
+ %0 = call i32 @llvm.r600.read.ngroups.z() #0
+ store i32 %0, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_size_x:
+; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; EG: MOV [[VAL]], KC0[0].W
+
+; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x3
+; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xc
+; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN: buffer_store_dword [[VVAL]]
+define void @global_size_x (i32 addrspace(1)* %out) {
+entry:
+ %0 = call i32 @llvm.r600.read.global.size.x() #0
+ store i32 %0, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_size_y:
+; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; EG: MOV [[VAL]], KC0[1].X
+
+; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4
+; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x10
+; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN: buffer_store_dword [[VVAL]]
+define void @global_size_y (i32 addrspace(1)* %out) {
+entry:
+ %0 = call i32 @llvm.r600.read.global.size.y() #0
+ store i32 %0, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}global_size_z:
+; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; EG: MOV [[VAL]], KC0[1].Y
+
+; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x5
+; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x14
+; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN: buffer_store_dword [[VVAL]]
+define void @global_size_z (i32 addrspace(1)* %out) {
+entry:
+ %0 = call i32 @llvm.r600.read.global.size.z() #0
+ store i32 %0, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_size_x:
+; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; EG: MOV [[VAL]], KC0[1].Z
+
+; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x6
+; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x18
+; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN: buffer_store_dword [[VVAL]]
+define void @local_size_x (i32 addrspace(1)* %out) {
+entry:
+ %0 = call i32 @llvm.r600.read.local.size.x() #0
+ store i32 %0, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_size_y:
+; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; EG: MOV [[VAL]], KC0[1].W
+
+; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x7
+; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1c
+; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN: buffer_store_dword [[VVAL]]
+define void @local_size_y (i32 addrspace(1)* %out) {
+entry:
+ %0 = call i32 @llvm.r600.read.local.size.y() #0
+ store i32 %0, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}local_size_z:
+; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; EG: MOV [[VAL]], KC0[2].X
+
+; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8
+; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x20
+; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN: buffer_store_dword [[VVAL]]
+define void @local_size_z (i32 addrspace(1)* %out) {
+entry:
+ %0 = call i32 @llvm.r600.read.local.size.z() #0
+ store i32 %0, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}get_work_dim:
+; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; EG: MOV [[VAL]], KC0[2].Z
+
+; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xb
+; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2c
+; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN: buffer_store_dword [[VVAL]]
+define void @get_work_dim (i32 addrspace(1)* %out) {
+entry:
+ %0 = call i32 @llvm.AMDGPU.read.workdim() #0
+ store i32 %0, i32 addrspace(1)* %out
+ ret void
+}
+
+; The tgid values are stored in sgprs offset by the number of user sgprs.
+; Currently we always use exactly 2 user sgprs for the pointer to the
+; kernel arguments, but this may change in the future.
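+; Consequently the three group IDs are read directly from consecutive SGPRs
+; (s4, s5 and s6 in the checks below) rather than being loaded from memory.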
+
+; FUNC-LABEL: {{^}}tgid_x:
+; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], s4
+; GCN: buffer_store_dword [[VVAL]]
+define void @tgid_x (i32 addrspace(1)* %out) {
+entry:
+ %0 = call i32 @llvm.r600.read.tgid.x() #0
+ store i32 %0, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}tgid_y:
+; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], s5
+; GCN: buffer_store_dword [[VVAL]]
+define void @tgid_y (i32 addrspace(1)* %out) {
+entry:
+ %0 = call i32 @llvm.r600.read.tgid.y() #0
+ store i32 %0, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}tgid_z:
+; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], s6
+; GCN: buffer_store_dword [[VVAL]]
+define void @tgid_z (i32 addrspace(1)* %out) {
+entry:
+ %0 = call i32 @llvm.r600.read.tgid.z() #0
+ store i32 %0, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}tidig_x:
+; GCN: buffer_store_dword v0
+define void @tidig_x (i32 addrspace(1)* %out) {
+entry:
+ %0 = call i32 @llvm.r600.read.tidig.x() #0
+ store i32 %0, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}tidig_y:
+; GCN: buffer_store_dword v1
+define void @tidig_y (i32 addrspace(1)* %out) {
+entry:
+ %0 = call i32 @llvm.r600.read.tidig.y() #0
+ store i32 %0, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}tidig_z:
+; GCN: buffer_store_dword v2
+define void @tidig_z (i32 addrspace(1)* %out) {
+entry:
+ %0 = call i32 @llvm.r600.read.tidig.z() #0
+ store i32 %0, i32 addrspace(1)* %out
+ ret void
+}
+
+declare i32 @llvm.r600.read.ngroups.x() #0
+declare i32 @llvm.r600.read.ngroups.y() #0
+declare i32 @llvm.r600.read.ngroups.z() #0
+
+declare i32 @llvm.r600.read.global.size.x() #0
+declare i32 @llvm.r600.read.global.size.y() #0
+declare i32 @llvm.r600.read.global.size.z() #0
+
+declare i32 @llvm.r600.read.local.size.x() #0
+declare i32 @llvm.r600.read.local.size.y() #0
+declare i32 @llvm.r600.read.local.size.z() #0
+
+declare i32 @llvm.r600.read.tgid.x() #0
+declare i32 @llvm.r600.read.tgid.y() #0
+declare i32 @llvm.r600.read.tgid.z() #0
+
+declare i32 @llvm.r600.read.tidig.x() #0
+declare i32 @llvm.r600.read.tidig.y() #0
+declare i32 @llvm.r600.read.tidig.z() #0
+
+declare i32 @llvm.AMDGPU.read.workdim() #0
+
+attributes #0 = { readnone }
diff --git a/test/CodeGen/AMDGPU/wrong-transalu-pos-fix.ll b/test/CodeGen/AMDGPU/wrong-transalu-pos-fix.ll
new file mode 100644
index 000000000000..8b383e4c393d
--- /dev/null
+++ b/test/CodeGen/AMDGPU/wrong-transalu-pos-fix.ll
@@ -0,0 +1,81 @@
+; RUN: llc -march=r600 -mcpu=redwood -mtriple=r600-- < %s | FileCheck %s
+
+; We want all MULLO_INT instructions to be last in their instruction group
+;CHECK: {{^}}fill3d:
+;CHECK-NOT: MULLO_INT T[0-9]+
+
+define void @fill3d(i32 addrspace(1)* nocapture %out) #0 {
+entry:
+ %x.i = tail call i32 @llvm.r600.read.global.size.x() #1
+ %y.i18 = tail call i32 @llvm.r600.read.global.size.y() #1
+ %mul = mul i32 %y.i18, %x.i
+ %z.i17 = tail call i32 @llvm.r600.read.global.size.z() #1
+ %mul3 = mul i32 %mul, %z.i17
+ %x.i.i = tail call i32 @llvm.r600.read.tgid.x() #1
+ %x.i12.i = tail call i32 @llvm.r600.read.local.size.x() #1
+ %mul26.i = mul i32 %x.i12.i, %x.i.i
+ %x.i4.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %add.i16 = add i32 %x.i4.i, %mul26.i
+ %mul7 = mul i32 %add.i16, %y.i18
+ %y.i.i = tail call i32 @llvm.r600.read.tgid.y() #1
+ %y.i14.i = tail call i32 @llvm.r600.read.local.size.y() #1
+ %mul30.i = mul i32 %y.i14.i, %y.i.i
+ %y.i6.i = tail call i32 @llvm.r600.read.tidig.y() #1
+ %add.i14 = add i32 %mul30.i, %mul7
+ %mul819 = add i32 %add.i14, %y.i6.i
+ %add = mul i32 %mul819, %z.i17
+ %z.i.i = tail call i32 @llvm.r600.read.tgid.z() #1
+ %z.i16.i = tail call i32 @llvm.r600.read.local.size.z() #1
+ %mul33.i = mul i32 %z.i16.i, %z.i.i
+ %z.i8.i = tail call i32 @llvm.r600.read.tidig.z() #1
+ %add.i = add i32 %z.i8.i, %mul33.i
+ %add13 = add i32 %add.i, %add
+ %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %add13
+ store i32 %mul3, i32 addrspace(1)* %arrayidx, align 4
+ ret void
+}
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tgid.x() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tgid.y() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tgid.z() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.local.size.x() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.local.size.y() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.local.size.z() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tidig.x() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tidig.y() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tidig.z() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.global.size.x() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.global.size.y() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.global.size.z() #1
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+
+!opencl.kernels = !{!0, !1, !2}
+
+!0 = !{null}
+!1 = !{null}
+!2 = !{void (i32 addrspace(1)*)* @fill3d}
diff --git a/test/CodeGen/AMDGPU/xor.ll b/test/CodeGen/AMDGPU/xor.ll
new file mode 100644
index 000000000000..089db59eabc7
--- /dev/null
+++ b/test/CodeGen/AMDGPU/xor.ll
@@ -0,0 +1,173 @@
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+
+; FUNC-LABEL: {{^}}xor_v2i32:
+; EG: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+; SI: v_xor_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_xor_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+
+define void @xor_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in0, <2 x i32> addrspace(1)* %in1) {
+ %a = load <2 x i32>, <2 x i32> addrspace(1) * %in0
+ %b = load <2 x i32>, <2 x i32> addrspace(1) * %in1
+ %result = xor <2 x i32> %a, %b
+ store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}xor_v4i32:
+; EG: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+; SI: v_xor_b32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_xor_b32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_xor_b32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_xor_b32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
+
+define void @xor_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1) {
+ %a = load <4 x i32>, <4 x i32> addrspace(1) * %in0
+ %b = load <4 x i32>, <4 x i32> addrspace(1) * %in1
+ %result = xor <4 x i32> %a, %b
+ store <4 x i32> %result, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}xor_i1:
+; EG: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], PS}}
+
+; SI-DAG: v_cmp_le_f32_e32 [[CMP0:vcc]], 0, {{v[0-9]+}}
+; SI-DAG: v_cmp_le_f32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], 1.0, {{v[0-9]+}}
+; SI: s_xor_b64 [[XOR:s\[[0-9]+:[0-9]+\]]], [[CMP0]], [[CMP1]]
+; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, [[XOR]]
+; SI: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @xor_i1(float addrspace(1)* %out, float addrspace(1)* %in0, float addrspace(1)* %in1) {
+ %a = load float, float addrspace(1) * %in0
+ %b = load float, float addrspace(1) * %in1
+ %acmp = fcmp oge float %a, 0.000000e+00
+ %bcmp = fcmp oge float %b, 1.000000e+00
+ %xor = xor i1 %acmp, %bcmp
+ %result = select i1 %xor, float %a, float %b
+ store float %result, float addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_xor_i1:
+; SI: buffer_load_ubyte [[B:v[0-9]+]]
+; SI: buffer_load_ubyte [[A:v[0-9]+]]
+; SI: v_xor_b32_e32 [[XOR:v[0-9]+]], [[A]], [[B]]
+; SI: v_and_b32_e32 [[RESULT:v[0-9]+]], 1, [[XOR]]
+; SI: buffer_store_byte [[RESULT]]
+define void @v_xor_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in0, i1 addrspace(1)* %in1) {
+ %a = load i1, i1 addrspace(1)* %in0
+ %b = load i1, i1 addrspace(1)* %in1
+ %xor = xor i1 %a, %b
+ store i1 %xor, i1 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}vector_xor_i32:
+; SI: v_xor_b32_e32
+define void @vector_xor_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) {
+ %a = load i32, i32 addrspace(1)* %in0
+ %b = load i32, i32 addrspace(1)* %in1
+ %result = xor i32 %a, %b
+ store i32 %result, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}scalar_xor_i32:
+; SI: s_xor_b32
+define void @scalar_xor_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+ %result = xor i32 %a, %b
+ store i32 %result, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}scalar_not_i32:
+; SI: s_not_b32
+define void @scalar_not_i32(i32 addrspace(1)* %out, i32 %a) {
+ %result = xor i32 %a, -1
+ store i32 %result, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}vector_not_i32:
+; SI: v_not_b32
+define void @vector_not_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) {
+ %a = load i32, i32 addrspace(1)* %in0
+ %b = load i32, i32 addrspace(1)* %in1
+ %result = xor i32 %a, -1
+ store i32 %result, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}vector_xor_i64:
+; SI: v_xor_b32_e32
+; SI: v_xor_b32_e32
+; SI: s_endpgm
+define void @vector_xor_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i64 addrspace(1)* %in1) {
+ %a = load i64, i64 addrspace(1)* %in0
+ %b = load i64, i64 addrspace(1)* %in1
+ %result = xor i64 %a, %b
+ store i64 %result, i64 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}scalar_xor_i64:
+; SI: s_xor_b64
+; SI: s_endpgm
+define void @scalar_xor_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+ %result = xor i64 %a, %b
+ store i64 %result, i64 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}scalar_not_i64:
+; SI: s_not_b64
+define void @scalar_not_i64(i64 addrspace(1)* %out, i64 %a) {
+ %result = xor i64 %a, -1
+ store i64 %result, i64 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}vector_not_i64:
+; SI: v_not_b32
+; SI: v_not_b32
+define void @vector_not_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i64 addrspace(1)* %in1) {
+ %a = load i64, i64 addrspace(1)* %in0
+ %b = load i64, i64 addrspace(1)* %in1
+ %result = xor i64 %a, -1
+ store i64 %result, i64 addrspace(1)* %out
+ ret void
+}
+
+; Test that we have a pattern to match xor inside a branch.
+; Note that in the future the backend may be smart enough to
+; use an SALU instruction for this.
+
+; FUNC-LABEL: {{^}}xor_cf:
+; SI: s_xor_b64
+define void @xor_cf(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b) {
+entry:
+ %0 = icmp eq i64 %a, 0
+ br i1 %0, label %if, label %else
+
+if:
+ %1 = xor i64 %a, %b
+ br label %endif
+
+else:
+ %2 = load i64, i64 addrspace(1)* %in
+ br label %endif
+
+endif:
+ %3 = phi i64 [%1, %if], [%2, %else]
+ store i64 %3, i64 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/zero_extend.ll b/test/CodeGen/AMDGPU/zero_extend.ll
new file mode 100644
index 000000000000..033055db185a
--- /dev/null
+++ b/test/CodeGen/AMDGPU/zero_extend.ll
@@ -0,0 +1,41 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600
+; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=SI
+
+; R600: {{^}}test:
+; R600: MEM_RAT_CACHELESS STORE_RAW
+; R600: MEM_RAT_CACHELESS STORE_RAW
+
+; SI: {{^}}test:
+; SI: s_mov_b32 [[ZERO:s[0-9]]], 0{{$}}
+; SI: v_mov_b32_e32 v[[V_ZERO:[0-9]]], [[ZERO]]
+; SI: buffer_store_dwordx2 v[0:[[V_ZERO]]{{\]}}
+define void @test(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
+entry:
+ %0 = mul i32 %a, %b
+ %1 = add i32 %0, %c
+ %2 = zext i32 %1 to i64
+ store i64 %2, i64 addrspace(1)* %out
+ ret void
+}
+
+; SI-LABEL: {{^}}testi1toi32:
+; SI: v_cndmask_b32
+define void @testi1toi32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+entry:
+ %0 = icmp eq i32 %a, %b
+ %1 = zext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; SI-LABEL: {{^}}zext_i1_to_i64:
+; SI: s_mov_b32 s{{[0-9]+}}, 0
+; SI: v_cmp_eq_i32
+; SI: v_cndmask_b32
+define void @zext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+ %cmp = icmp eq i32 %a, %b
+ %ext = zext i1 %cmp to i64
+ store i64 %ext, i64 addrspace(1)* %out, align 8
+ ret void
+}