Diffstat (limited to 'test/CodeGen/NVPTX')
-rw-r--r--  test/CodeGen/NVPTX/annotations.ll          |  2
-rw-r--r--  test/CodeGen/NVPTX/arithmetic-fp-sm10.ll   | 72
-rw-r--r--  test/CodeGen/NVPTX/arithmetic-int.ll       |  2
-rw-r--r--  test/CodeGen/NVPTX/calling-conv.ll         |  2
-rw-r--r--  test/CodeGen/NVPTX/compare-int.ll          |  2
-rw-r--r--  test/CodeGen/NVPTX/convert-fp.ll           |  2
-rw-r--r--  test/CodeGen/NVPTX/convert-int-sm10.ll     | 55
-rw-r--r--  test/CodeGen/NVPTX/intrin-nocapture.ll     | 21
-rw-r--r--  test/CodeGen/NVPTX/intrinsic-old.ll        |  2
-rw-r--r--  test/CodeGen/NVPTX/intrinsics.ll           |  2
-rw-r--r--  test/CodeGen/NVPTX/ld-addrspace.ll         |  2
-rw-r--r--  test/CodeGen/NVPTX/nvvm-reflect.ll         | 34
-rw-r--r--  test/CodeGen/NVPTX/sched1.ll               | 31
-rw-r--r--  test/CodeGen/NVPTX/sched2.ll               | 32
-rw-r--r--  test/CodeGen/NVPTX/sm-version-10.ll        |  6
-rw-r--r--  test/CodeGen/NVPTX/sm-version-11.ll        |  6
-rw-r--r--  test/CodeGen/NVPTX/sm-version-12.ll        |  6
-rw-r--r--  test/CodeGen/NVPTX/sm-version-13.ll        |  6
-rw-r--r--  test/CodeGen/NVPTX/st-addrspace.ll         |  2
-rw-r--r--  test/CodeGen/NVPTX/tuple-literal.ll        |  5
-rw-r--r--  test/CodeGen/NVPTX/vector-args.ll          | 27
-rw-r--r--  test/CodeGen/NVPTX/vector-compare.ll       | 19
-rw-r--r--  test/CodeGen/NVPTX/vector-loads.ll         | 66
-rw-r--r--  test/CodeGen/NVPTX/vector-select.ll        | 16
24 files changed, 251 insertions, 169 deletions
diff --git a/test/CodeGen/NVPTX/annotations.ll b/test/CodeGen/NVPTX/annotations.ll
index d93f688ef1fd1..39d52d382663a 100644
--- a/test/CodeGen/NVPTX/annotations.ll
+++ b/test/CodeGen/NVPTX/annotations.ll
@@ -1,5 +1,3 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_10 | FileCheck %s
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_10 | FileCheck %s
; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
diff --git a/test/CodeGen/NVPTX/arithmetic-fp-sm10.ll b/test/CodeGen/NVPTX/arithmetic-fp-sm10.ll
deleted file mode 100644
index 73c77f56bc9ce..0000000000000
--- a/test/CodeGen/NVPTX/arithmetic-fp-sm10.ll
+++ /dev/null
@@ -1,72 +0,0 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_10 | FileCheck %s
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_10 | FileCheck %s
-
-;; These tests should run for all targets
-
-;;===-- Basic instruction selection tests ---------------------------------===;;
-
-
-;;; f64
-
-define double @fadd_f64(double %a, double %b) {
-; CHECK: add.f64 %fl{{[0-9]+}}, %fl{{[0-9]+}}, %fl{{[0-9]+}}
-; CHECK: ret
- %ret = fadd double %a, %b
- ret double %ret
-}
-
-define double @fsub_f64(double %a, double %b) {
-; CHECK: sub.f64 %fl{{[0-9]+}}, %fl{{[0-9]+}}, %fl{{[0-9]+}}
-; CHECK: ret
- %ret = fsub double %a, %b
- ret double %ret
-}
-
-define double @fmul_f64(double %a, double %b) {
-; CHECK: mul.f64 %fl{{[0-9]+}}, %fl{{[0-9]+}}, %fl{{[0-9]+}}
-; CHECK: ret
- %ret = fmul double %a, %b
- ret double %ret
-}
-
-define double @fdiv_f64(double %a, double %b) {
-; CHECK: div.rn.f64 %fl{{[0-9]+}}, %fl{{[0-9]+}}, %fl{{[0-9]+}}
-; CHECK: ret
- %ret = fdiv double %a, %b
- ret double %ret
-}
-
-;; PTX does not have a floating-point rem instruction
-
-
-;;; f32
-
-define float @fadd_f32(float %a, float %b) {
-; CHECK: add.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}
-; CHECK: ret
- %ret = fadd float %a, %b
- ret float %ret
-}
-
-define float @fsub_f32(float %a, float %b) {
-; CHECK: sub.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}
-; CHECK: ret
- %ret = fsub float %a, %b
- ret float %ret
-}
-
-define float @fmul_f32(float %a, float %b) {
-; CHECK: mul.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}
-; CHECK: ret
- %ret = fmul float %a, %b
- ret float %ret
-}
-
-define float @fdiv_f32(float %a, float %b) {
-; CHECK: div.full.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}
-; CHECK: ret
- %ret = fdiv float %a, %b
- ret float %ret
-}
-
-;; PTX does not have a floating-point rem instruction
diff --git a/test/CodeGen/NVPTX/arithmetic-int.ll b/test/CodeGen/NVPTX/arithmetic-int.ll
index 529f84900afd7..8d73b7e6c4c66 100644
--- a/test/CodeGen/NVPTX/arithmetic-int.ll
+++ b/test/CodeGen/NVPTX/arithmetic-int.ll
@@ -1,5 +1,3 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_10 | FileCheck %s
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_10 | FileCheck %s
; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
diff --git a/test/CodeGen/NVPTX/calling-conv.ll b/test/CodeGen/NVPTX/calling-conv.ll
index 968203e5f70ec..190a1462adbca 100644
--- a/test/CodeGen/NVPTX/calling-conv.ll
+++ b/test/CodeGen/NVPTX/calling-conv.ll
@@ -1,5 +1,3 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_10 | FileCheck %s
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_10 | FileCheck %s
; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
diff --git a/test/CodeGen/NVPTX/compare-int.ll b/test/CodeGen/NVPTX/compare-int.ll
index 12fc7548212cc..16af0a336ddc0 100644
--- a/test/CodeGen/NVPTX/compare-int.ll
+++ b/test/CodeGen/NVPTX/compare-int.ll
@@ -1,5 +1,3 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_10 | FileCheck %s
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_10 | FileCheck %s
; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
diff --git a/test/CodeGen/NVPTX/convert-fp.ll b/test/CodeGen/NVPTX/convert-fp.ll
index 21c84379b0621..1882121fa7240 100644
--- a/test/CodeGen/NVPTX/convert-fp.ll
+++ b/test/CodeGen/NVPTX/convert-fp.ll
@@ -1,5 +1,3 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_10 | FileCheck %s
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_10 | FileCheck %s
; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
diff --git a/test/CodeGen/NVPTX/convert-int-sm10.ll b/test/CodeGen/NVPTX/convert-int-sm10.ll
deleted file mode 100644
index 20716f982e3b4..0000000000000
--- a/test/CodeGen/NVPTX/convert-int-sm10.ll
+++ /dev/null
@@ -1,55 +0,0 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_10 | FileCheck %s
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_10 | FileCheck %s
-
-
-; i16
-
-define i16 @cvt_i16_i32(i32 %x) {
-; CHECK: cvt.u16.u32 %rs{{[0-9]+}}, %r{{[0-9]+}}
-; CHECK: ret
- %a = trunc i32 %x to i16
- ret i16 %a
-}
-
-define i16 @cvt_i16_i64(i64 %x) {
-; CHECK: cvt.u16.u64 %rs{{[0-9]+}}, %rl{{[0-9]+}}
-; CHECK: ret
- %a = trunc i64 %x to i16
- ret i16 %a
-}
-
-
-
-; i32
-
-define i32 @cvt_i32_i16(i16 %x) {
-; CHECK: cvt.u32.u16 %r{{[0-9]+}}, %rs{{[0-9]+}}
-; CHECK: ret
- %a = zext i16 %x to i32
- ret i32 %a
-}
-
-define i32 @cvt_i32_i64(i64 %x) {
-; CHECK: cvt.u32.u64 %r{{[0-9]+}}, %rl{{[0-9]+}}
-; CHECK: ret
- %a = trunc i64 %x to i32
- ret i32 %a
-}
-
-
-
-; i64
-
-define i64 @cvt_i64_i16(i16 %x) {
-; CHECK: cvt.u64.u16 %rl{{[0-9]+}}, %rs{{[0-9]+}}
-; CHECK: ret
- %a = zext i16 %x to i64
- ret i64 %a
-}
-
-define i64 @cvt_i64_i32(i32 %x) {
-; CHECK: cvt.u64.u32 %rl{{[0-9]+}}, %r{{[0-9]+}}
-; CHECK: ret
- %a = zext i32 %x to i64
- ret i64 %a
-}
diff --git a/test/CodeGen/NVPTX/intrin-nocapture.ll b/test/CodeGen/NVPTX/intrin-nocapture.ll
new file mode 100644
index 0000000000000..55781bb15a0b8
--- /dev/null
+++ b/test/CodeGen/NVPTX/intrin-nocapture.ll
@@ -0,0 +1,21 @@
+; RUN: opt < %s -O3 -S | FileCheck %s
+
+; Address space intrinsics were erroneously marked NoCapture, leading to bad
+; optimizations (such as the store below being eliminated as dead code). This
+; test makes sure we don't regress.
+
+declare void @foo(i32 addrspace(1)*)
+
+declare i32 addrspace(1)* @llvm.nvvm.ptr.gen.to.global.p1i32.p0i32(i32*)
+
+; CHECK: @bar
+define void @bar() {
+ %t1 = alloca i32
+; CHECK: call i32 addrspace(1)* @llvm.nvvm.ptr.gen.to.global.p1i32.p0i32(i32* %t1)
+; CHECK-NEXT: store i32 10, i32* %t1
+ %t2 = call i32 addrspace(1)* @llvm.nvvm.ptr.gen.to.global.p1i32.p0i32(i32* %t1)
+ store i32 10, i32* %t1
+ call void @foo(i32 addrspace(1)* %t2)
+ ret void
+}
+
diff --git a/test/CodeGen/NVPTX/intrinsic-old.ll b/test/CodeGen/NVPTX/intrinsic-old.ll
index 1c9879c4178b9..53a28f333798d 100644
--- a/test/CodeGen/NVPTX/intrinsic-old.ll
+++ b/test/CodeGen/NVPTX/intrinsic-old.ll
@@ -1,5 +1,3 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_10 | FileCheck %s
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_10 | FileCheck %s
; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
diff --git a/test/CodeGen/NVPTX/intrinsics.ll b/test/CodeGen/NVPTX/intrinsics.ll
index afab60ca96a8a..8b0357be87cb0 100644
--- a/test/CodeGen/NVPTX/intrinsics.ll
+++ b/test/CodeGen/NVPTX/intrinsics.ll
@@ -1,5 +1,3 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_10 | FileCheck %s
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_10 | FileCheck %s
; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
diff --git a/test/CodeGen/NVPTX/ld-addrspace.ll b/test/CodeGen/NVPTX/ld-addrspace.ll
index d1f5093df223e..3265868d3c524 100644
--- a/test/CodeGen/NVPTX/ld-addrspace.ll
+++ b/test/CodeGen/NVPTX/ld-addrspace.ll
@@ -1,6 +1,4 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_10 | FileCheck %s --check-prefix=PTX32
; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix=PTX32
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_10 | FileCheck %s --check-prefix=PTX64
; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix=PTX64
diff --git a/test/CodeGen/NVPTX/nvvm-reflect.ll b/test/CodeGen/NVPTX/nvvm-reflect.ll
new file mode 100644
index 0000000000000..0d02194651e34
--- /dev/null
+++ b/test/CodeGen/NVPTX/nvvm-reflect.ll
@@ -0,0 +1,34 @@
+; RUN: opt < %s -S -nvvm-reflect -nvvm-reflect-list USE_MUL=0 -O2 | FileCheck %s --check-prefix=USE_MUL_0
+; RUN: opt < %s -S -nvvm-reflect -nvvm-reflect-list USE_MUL=1 -O2 | FileCheck %s --check-prefix=USE_MUL_1
+
+@str = private addrspace(4) unnamed_addr constant [8 x i8] c"USE_MUL\00"
+
+declare i32 @__nvvm_reflect(i8*)
+declare i8* @llvm.nvvm.ptr.constant.to.gen.p0i8.p4i8(i8 addrspace(4)*)
+
+define float @foo(float %a, float %b) {
+; USE_MUL_0: define float @foo
+; USE_MUL_0-NOT: call i32 @__nvvm_reflect
+; USE_MUL_1: define float @foo
+; USE_MUL_1-NOT: call i32 @__nvvm_reflect
+ %ptr = tail call i8* @llvm.nvvm.ptr.constant.to.gen.p0i8.p4i8(i8 addrspace(4)* getelementptr inbounds ([8 x i8] addrspace(4)* @str, i32 0, i32 0))
+ %reflect = tail call i32 @__nvvm_reflect(i8* %ptr)
+ %cmp = icmp ugt i32 %reflect, 0
+ br i1 %cmp, label %use_mul, label %use_add
+
+use_mul:
+; USE_MUL_1: fmul float %a, %b
+; USE_MUL_0-NOT: fadd float %a, %b
+ %ret1 = fmul float %a, %b
+ br label %exit
+
+use_add:
+; USE_MUL_0: fadd float %a, %b
+; USE_MUL_1-NOT: fmul float %a, %b
+ %ret2 = fadd float %a, %b
+ br label %exit
+
+exit:
+ %ret = phi float [%ret1, %use_mul], [%ret2, %use_add]
+ ret float %ret
+}
diff --git a/test/CodeGen/NVPTX/sched1.ll b/test/CodeGen/NVPTX/sched1.ll
new file mode 100644
index 0000000000000..03ab635e73b93
--- /dev/null
+++ b/test/CodeGen/NVPTX/sched1.ll
@@ -0,0 +1,31 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+
+; Ensure source scheduling is working
+
+define void @foo(i32* %a) {
+; CHECK: .func foo
+; CHECK: ld.u32
+; CHECK-NEXT: ld.u32
+; CHECK-NEXT: ld.u32
+; CHECK-NEXT: ld.u32
+; CHECK-NEXT: add.s32
+; CHECK-NEXT: add.s32
+; CHECK-NEXT: add.s32
+ %ptr0 = getelementptr i32* %a, i32 0
+ %val0 = load i32* %ptr0
+ %ptr1 = getelementptr i32* %a, i32 1
+ %val1 = load i32* %ptr1
+ %ptr2 = getelementptr i32* %a, i32 2
+ %val2 = load i32* %ptr2
+ %ptr3 = getelementptr i32* %a, i32 3
+ %val3 = load i32* %ptr3
+
+ %t0 = add i32 %val0, %val1
+ %t1 = add i32 %t0, %val2
+ %t2 = add i32 %t1, %val3
+
+ store i32 %t2, i32* %a
+
+ ret void
+}
+
diff --git a/test/CodeGen/NVPTX/sched2.ll b/test/CodeGen/NVPTX/sched2.ll
new file mode 100644
index 0000000000000..71a9a4963fafe
--- /dev/null
+++ b/test/CodeGen/NVPTX/sched2.ll
@@ -0,0 +1,32 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+
+define void @foo(<2 x i32>* %a) {
+; CHECK: .func foo
+; CHECK: ld.v2.u32
+; CHECK-NEXT: ld.v2.u32
+; CHECK-NEXT: ld.v2.u32
+; CHECK-NEXT: ld.v2.u32
+; CHECK-NEXT: add.s32
+; CHECK-NEXT: add.s32
+; CHECK-NEXT: add.s32
+; CHECK-NEXT: add.s32
+; CHECK-NEXT: add.s32
+; CHECK-NEXT: add.s32
+ %ptr0 = getelementptr <2 x i32>* %a, i32 0
+ %val0 = load <2 x i32>* %ptr0
+ %ptr1 = getelementptr <2 x i32>* %a, i32 1
+ %val1 = load <2 x i32>* %ptr1
+ %ptr2 = getelementptr <2 x i32>* %a, i32 2
+ %val2 = load <2 x i32>* %ptr2
+ %ptr3 = getelementptr <2 x i32>* %a, i32 3
+ %val3 = load <2 x i32>* %ptr3
+
+ %t0 = add <2 x i32> %val0, %val1
+ %t1 = add <2 x i32> %t0, %val2
+ %t2 = add <2 x i32> %t1, %val3
+
+ store <2 x i32> %t2, <2 x i32>* %a
+
+ ret void
+}
+
diff --git a/test/CodeGen/NVPTX/sm-version-10.ll b/test/CodeGen/NVPTX/sm-version-10.ll
deleted file mode 100644
index 9324a3780986c..0000000000000
--- a/test/CodeGen/NVPTX/sm-version-10.ll
+++ /dev/null
@@ -1,6 +0,0 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_10 | FileCheck %s
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_10 | FileCheck %s
-
-
-; CHECK: .target sm_10
-
diff --git a/test/CodeGen/NVPTX/sm-version-11.ll b/test/CodeGen/NVPTX/sm-version-11.ll
deleted file mode 100644
index 9033a4eba5e46..0000000000000
--- a/test/CodeGen/NVPTX/sm-version-11.ll
+++ /dev/null
@@ -1,6 +0,0 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_11 | FileCheck %s
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_11 | FileCheck %s
-
-
-; CHECK: .target sm_11
-
diff --git a/test/CodeGen/NVPTX/sm-version-12.ll b/test/CodeGen/NVPTX/sm-version-12.ll
deleted file mode 100644
index d8ee85c9010e7..0000000000000
--- a/test/CodeGen/NVPTX/sm-version-12.ll
+++ /dev/null
@@ -1,6 +0,0 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_12 | FileCheck %s
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_12 | FileCheck %s
-
-
-; CHECK: .target sm_12
-
diff --git a/test/CodeGen/NVPTX/sm-version-13.ll b/test/CodeGen/NVPTX/sm-version-13.ll
deleted file mode 100644
index ad67d642ce306..0000000000000
--- a/test/CodeGen/NVPTX/sm-version-13.ll
+++ /dev/null
@@ -1,6 +0,0 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_13 | FileCheck %s
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_13 | FileCheck %s
-
-
-; CHECK: .target sm_13
-
diff --git a/test/CodeGen/NVPTX/st-addrspace.ll b/test/CodeGen/NVPTX/st-addrspace.ll
index 54e04ae6106d2..0b26d802df841 100644
--- a/test/CodeGen/NVPTX/st-addrspace.ll
+++ b/test/CodeGen/NVPTX/st-addrspace.ll
@@ -1,6 +1,4 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_10 | FileCheck %s --check-prefix=PTX32
; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix=PTX32
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_10 | FileCheck %s --check-prefix=PTX64
; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix=PTX64
diff --git a/test/CodeGen/NVPTX/tuple-literal.ll b/test/CodeGen/NVPTX/tuple-literal.ll
new file mode 100644
index 0000000000000..2b1f2c4b6680a
--- /dev/null
+++ b/test/CodeGen/NVPTX/tuple-literal.ll
@@ -0,0 +1,5 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20
+
+define ptx_device void @test_function({i8, i8}*) {
+ ret void
+}
diff --git a/test/CodeGen/NVPTX/vector-args.ll b/test/CodeGen/NVPTX/vector-args.ll
new file mode 100644
index 0000000000000..80deae46935af
--- /dev/null
+++ b/test/CodeGen/NVPTX/vector-args.ll
@@ -0,0 +1,27 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+
+
+define float @foo(<2 x float> %a) {
+; CHECK: .func (.param .b32 func_retval0) foo
+; CHECK: .param .align 8 .b8 foo_param_0[8]
+; CHECK: ld.param.f32 %f{{[0-9]+}}
+; CHECK: ld.param.f32 %f{{[0-9]+}}
+ %t1 = fmul <2 x float> %a, %a
+ %t2 = extractelement <2 x float> %t1, i32 0
+ %t3 = extractelement <2 x float> %t1, i32 1
+ %t4 = fadd float %t2, %t3
+ ret float %t4
+}
+
+
+define float @bar(<4 x float> %a) {
+; CHECK: .func (.param .b32 func_retval0) bar
+; CHECK: .param .align 16 .b8 bar_param_0[16]
+; CHECK: ld.param.f32 %f{{[0-9]+}}
+; CHECK: ld.param.f32 %f{{[0-9]+}}
+ %t1 = fmul <4 x float> %a, %a
+ %t2 = extractelement <4 x float> %t1, i32 0
+ %t3 = extractelement <4 x float> %t1, i32 1
+ %t4 = fadd float %t2, %t3
+ ret float %t4
+}
diff --git a/test/CodeGen/NVPTX/vector-compare.ll b/test/CodeGen/NVPTX/vector-compare.ll
new file mode 100644
index 0000000000000..2180499952336
--- /dev/null
+++ b/test/CodeGen/NVPTX/vector-compare.ll
@@ -0,0 +1,19 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20
+
+; This test makes sure that the results of vector compares are properly
+; scalarized. If codegen fails, it means the type legalizer incorrectly
+; tried to promote <2 x i1> to <2 x i8> and instruction selection failed.
+
+define void @foo(<2 x i32>* %a, <2 x i32>* %b, i32* %r1, i32* %r2) {
+ %aval = load <2 x i32>* %a
+ %bval = load <2 x i32>* %b
+ %res = icmp slt <2 x i32> %aval, %bval
+ %t1 = extractelement <2 x i1> %res, i32 0
+ %t2 = extractelement <2 x i1> %res, i32 1
+ %t1a = zext i1 %t1 to i32
+ %t2a = zext i1 %t2 to i32
+ store i32 %t1a, i32* %r1
+ store i32 %t2a, i32* %r2
+ ret void
+}
diff --git a/test/CodeGen/NVPTX/vector-loads.ll b/test/CodeGen/NVPTX/vector-loads.ll
new file mode 100644
index 0000000000000..58882bf166681
--- /dev/null
+++ b/test/CodeGen/NVPTX/vector-loads.ll
@@ -0,0 +1,66 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+
+; Even though general vector types are not supported in PTX, we can still
+; optimize loads/stores with pseudo-vector instructions of the form:
+;
+; ld.v2.f32 {%f0, %f1}, [%r0]
+;
+; which will load two floats at once into scalar registers.
+
+define void @foo(<2 x float>* %a) {
+; CHECK: .func foo
+; CHECK: ld.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}
+ %t1 = load <2 x float>* %a
+ %t2 = fmul <2 x float> %t1, %t1
+ store <2 x float> %t2, <2 x float>* %a
+ ret void
+}
+
+define void @foo2(<4 x float>* %a) {
+; CHECK: .func foo2
+; CHECK: ld.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}
+ %t1 = load <4 x float>* %a
+ %t2 = fmul <4 x float> %t1, %t1
+ store <4 x float> %t2, <4 x float>* %a
+ ret void
+}
+
+define void @foo3(<8 x float>* %a) {
+; CHECK: .func foo3
+; CHECK: ld.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}
+; CHECK-NEXT: ld.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}
+ %t1 = load <8 x float>* %a
+ %t2 = fmul <8 x float> %t1, %t1
+ store <8 x float> %t2, <8 x float>* %a
+ ret void
+}
+
+
+
+define void @foo4(<2 x i32>* %a) {
+; CHECK: .func foo4
+; CHECK: ld.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}
+ %t1 = load <2 x i32>* %a
+ %t2 = mul <2 x i32> %t1, %t1
+ store <2 x i32> %t2, <2 x i32>* %a
+ ret void
+}
+
+define void @foo5(<4 x i32>* %a) {
+; CHECK: .func foo5
+; CHECK: ld.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}
+ %t1 = load <4 x i32>* %a
+ %t2 = mul <4 x i32> %t1, %t1
+ store <4 x i32> %t2, <4 x i32>* %a
+ ret void
+}
+
+define void @foo6(<8 x i32>* %a) {
+; CHECK: .func foo6
+; CHECK: ld.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}
+; CHECK-NEXT: ld.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}
+ %t1 = load <8 x i32>* %a
+ %t2 = mul <8 x i32> %t1, %t1
+ store <8 x i32> %t2, <8 x i32>* %a
+ ret void
+}
diff --git a/test/CodeGen/NVPTX/vector-select.ll b/test/CodeGen/NVPTX/vector-select.ll
new file mode 100644
index 0000000000000..11893df10329b
--- /dev/null
+++ b/test/CodeGen/NVPTX/vector-select.ll
@@ -0,0 +1,16 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20
+
+; This test makes sure that vector selects are scalarized by the type legalizer.
+; If they are not, type legalization will fail.
+
+define void @foo(<2 x i32> addrspace(1)* %def_a, <2 x i32> addrspace(1)* %def_b, <2 x i32> addrspace(1)* %def_c) {
+entry:
+ %tmp4 = load <2 x i32> addrspace(1)* %def_a
+ %tmp6 = load <2 x i32> addrspace(1)* %def_c
+ %tmp8 = load <2 x i32> addrspace(1)* %def_b
+ %0 = icmp sge <2 x i32> %tmp4, zeroinitializer
+ %cond = select <2 x i1> %0, <2 x i32> %tmp6, <2 x i32> %tmp8
+ store <2 x i32> %cond, <2 x i32> addrspace(1)* %def_c
+ ret void
+}