30 files changed, 217 insertions, 35 deletions
diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/aa-metadata.ll b/test/Transforms/LoadStoreVectorizer/AMDGPU/aa-metadata.ll
index 4b2dab47a20f..d2834be18b0b 100644
--- a/test/Transforms/LoadStoreVectorizer/AMDGPU/aa-metadata.ll
+++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/aa-metadata.ll
@@ -1,7 +1,7 @@
 ; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -scoped-noalias -load-store-vectorizer -S -o - %s | FileCheck -check-prefix=SCOPE -check-prefix=ALL %s
 ; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck -check-prefix=NOSCOPE -check-prefix=ALL %s
 
-target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
 
 ; This fails to vectorize if the !alias.scope is not used
 
diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll b/test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll
index 87acb1057afb..b0dd5d185c77 100644
--- a/test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll
+++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll
@@ -1,7 +1,10 @@
-; RUN: opt -data-layout=A5 -S -load-store-vectorizer -mattr=-unaligned-buffer-access,+max-private-element-size-16 < %s | FileCheck -check-prefix=ALIGNED -check-prefix=ALL %s
-; RUN: opt -data-layout=A5 -S -load-store-vectorizer -mattr=+unaligned-buffer-access,+unaligned-scratch-access,+max-private-element-size-16 < %s | FileCheck -check-prefix=UNALIGNED -check-prefix=ALL %s
+; RUN: opt -S -load-store-vectorizer -mattr=-unaligned-buffer-access,+max-private-element-size-16 < %s | FileCheck -check-prefix=ALIGNED -check-prefix=ALL %s
+; RUN: opt -S -load-store-vectorizer -mattr=+unaligned-buffer-access,+unaligned-scratch-access,+max-private-element-size-16 < %s | FileCheck -check-prefix=UNALIGNED -check-prefix=ALL %s
+; RUN: opt -S -passes='function(load-store-vectorizer)' -mattr=-unaligned-buffer-access,+max-private-element-size-16 < %s | FileCheck -check-prefix=ALIGNED -check-prefix=ALL %s
+; RUN: opt -S -passes='function(load-store-vectorizer)' -mattr=+unaligned-buffer-access,+unaligned-scratch-access,+max-private-element-size-16 < %s | FileCheck -check-prefix=UNALIGNED -check-prefix=ALL %s
 
 target triple = "amdgcn--"
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
 
 ; ALL-LABEL: @load_unknown_offset_align1_i8(
 ; ALL: alloca [128 x i8], align 1
@@ -63,10 +66,7 @@ define amdgpu_kernel void @load_unknown_offset_align1_i32(i32 addrspace(1)* noal
 ; ALL: alloca [128 x i32], align 16
 
 ; UNALIGNED: load <2 x i32>, <2 x i32> addrspace(5)* %{{[0-9]+}}, align 1{{$}}
-
-; FIXME: Should change alignment
-; ALIGNED: load i32
-; ALIGNED: load i32
+; ALIGNED: load <2 x i32>, <2 x i32> addrspace(5)* %{{[0-9]+}}, align 4{{$}}
 define amdgpu_kernel void @load_alloca16_unknown_offset_align1_i32(i32 addrspace(1)* noalias %out, i32 %offset) #0 {
   %alloca = alloca [128 x i32], align 16, addrspace(5)
   %ptr0 = getelementptr inbounds [128 x i32], [128 x i32] addrspace(5)* %alloca, i32 0, i32 %offset
@@ -127,5 +127,84 @@ define amdgpu_kernel void @store_unknown_offset_align1_i32(i32 addrspace(1)* noa
   ret void
 }
 
-attributes #0 = { nounwind }
+; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i32(
+; ALIGNED: %alloca = alloca [8 x i32], align 4, addrspace(5)
+; ALIGNED: store <4 x i32> <i32 9, i32 1, i32 23, i32 19>, <4 x i32> addrspace(5)* %1, align 4
+
+; UNALIGNED: %alloca = alloca [8 x i32], align 1, addrspace(5)
+; UNALIGNED: store <4 x i32> <i32 9, i32 1, i32 23, i32 19>, <4 x i32> addrspace(5)* %1, align 1
+define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i32() {
+  %alloca = alloca [8 x i32], align 1, addrspace(5) ; declared align 1; the ALIGNED checks above expect the pass to raise this to 4
+  %out = bitcast [8 x i32] addrspace(5)* %alloca to i32 addrspace(5)* ; i32 view of element 0
+  %out.gep.1 = getelementptr i32, i32 addrspace(5)* %out, i32 1 ; elements 1-3 sit directly after element 0
+  %out.gep.2 = getelementptr i32, i32 addrspace(5)* %out, i32 2
+  %out.gep.3 = getelementptr i32, i32 addrspace(5)* %out, i32 3
+
+  store i32 9, i32 addrspace(5)* %out, align 1 ; four adjacent align-1 scalar stores; both check prefixes expect a single <4 x i32> store
+  store i32 1, i32 addrspace(5)* %out.gep.1, align 1
+  store i32 23, i32 addrspace(5)* %out.gep.2, align 1
+  store i32 19, i32 addrspace(5)* %out.gep.3, align 1
+  ret void
+}
+
+; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i8(
+; ALIGNED: %alloca = alloca [8 x i8], align 4, addrspace(5)
+; ALIGNED: store <4 x i8> <i8 9, i8 1, i8 23, i8 19>, <4 x i8> addrspace(5)* %1, align 4
+
+; UNALIGNED: %alloca = alloca [8 x i8], align 1, addrspace(5)
+; UNALIGNED: store <4 x i8> <i8 9, i8 1, i8 23, i8 19>, <4 x i8> addrspace(5)* %1, align 1
+define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i8() {
+  %alloca = alloca [8 x i8], align 1, addrspace(5) ; declared align 1; the ALIGNED checks above expect the pass to raise this to 4
+  %out = bitcast [8 x i8] addrspace(5)* %alloca to i8 addrspace(5)* ; i8 view of byte 0
+  %out.gep.1 = getelementptr i8, i8 addrspace(5)* %out, i8 1 ; bytes 1-3 sit directly after byte 0 (GEP index is typed i8 here)
+  %out.gep.2 = getelementptr i8, i8 addrspace(5)* %out, i8 2
+  %out.gep.3 = getelementptr i8, i8 addrspace(5)* %out, i8 3
+
+  store i8 9, i8 addrspace(5)* %out, align 1 ; four adjacent byte stores; both check prefixes expect a single <4 x i8> store
+  store i8 1, i8 addrspace(5)* %out.gep.1, align 1
+  store i8 23, i8 addrspace(5)* %out.gep.2, align 1
+  store i8 19, i8 addrspace(5)* %out.gep.3, align 1
+  ret void
+}
 
+; ALL-LABEL: @merge_private_load_4_vector_elts_loads_v4i32(
+; ALIGNED: %alloca = alloca [8 x i32], align 4, addrspace(5)
+; ALIGNED: load <4 x i32>, <4 x i32> addrspace(5)* %1, align 4
+
+; UNALIGNED: %alloca = alloca [8 x i32], align 1, addrspace(5)
+; UNALIGNED: load <4 x i32>, <4 x i32> addrspace(5)* %1, align 1
+define amdgpu_kernel void @merge_private_load_4_vector_elts_loads_v4i32() {
+  %alloca = alloca [8 x i32], align 1, addrspace(5) ; declared align 1; the ALIGNED checks above expect the pass to raise this to 4
+  %out = bitcast [8 x i32] addrspace(5)* %alloca to i32 addrspace(5)* ; i32 view of element 0
+  %out.gep.1 = getelementptr i32, i32 addrspace(5)* %out, i32 1 ; elements 1-3 sit directly after element 0
+  %out.gep.2 = getelementptr i32, i32 addrspace(5)* %out, i32 2
+  %out.gep.3 = getelementptr i32, i32 addrspace(5)* %out, i32 3
+
+  %load0 = load i32, i32 addrspace(5)* %out, align 1 ; four adjacent align-1 scalar loads; both check prefixes expect a single <4 x i32> load
+  %load1 = load i32, i32 addrspace(5)* %out.gep.1, align 1
+  %load2 = load i32, i32 addrspace(5)* %out.gep.2, align 1
+  %load3 = load i32, i32 addrspace(5)* %out.gep.3, align 1
+  ret void ; the loaded values have no users in this function
+}
+
+; ALL-LABEL: @merge_private_load_4_vector_elts_loads_v4i8(
+; ALIGNED: %alloca = alloca [8 x i8], align 4, addrspace(5)
+; ALIGNED: load <4 x i8>, <4 x i8> addrspace(5)* %1, align 4
+
+; UNALIGNED: %alloca = alloca [8 x i8], align 1, addrspace(5)
+; UNALIGNED: load <4 x i8>, <4 x i8> addrspace(5)* %1, align 1
+define amdgpu_kernel void @merge_private_load_4_vector_elts_loads_v4i8() {
+  %alloca = alloca [8 x i8], align 1, addrspace(5) ; declared align 1; the ALIGNED checks above expect the pass to raise this to 4
+  %out = bitcast [8 x i8] addrspace(5)* %alloca to i8 addrspace(5)* ; i8 view of byte 0
+  %out.gep.1 = getelementptr i8, i8 addrspace(5)* %out, i8 1 ; bytes 1-3 sit directly after byte 0 (GEP index is typed i8 here)
+  %out.gep.2 = getelementptr i8, i8 addrspace(5)* %out, i8 2
+  %out.gep.3 = getelementptr i8, i8 addrspace(5)* %out, i8 3
+
+  %load0 = load i8, i8 addrspace(5)* %out, align 1 ; four adjacent byte loads; both check prefixes expect a single <4 x i8> load
+  %load1 = load i8, i8 addrspace(5)* %out.gep.1, align 1
+  %load2 = load i8, i8 addrspace(5)* %out.gep.2, align 1
+  %load3 = load i8, i8 addrspace(5)* %out.gep.3, align 1
+  ret void ; the loaded values have no users in this function
+}
+
+attributes #0 = { nounwind }
diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/complex-index.ll b/test/Transforms/LoadStoreVectorizer/AMDGPU/complex-index.ll
index 220efd21fe19..cd1c7fdc521b 100644
--- a/test/Transforms/LoadStoreVectorizer/AMDGPU/complex-index.ll
+++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/complex-index.ll
@@ -1,4 +1,7 @@
 ; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -S -o - %s | FileCheck %s
+
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
 
 declare i64 @_Z12get_local_idj(i32)
 
@@ -46,4 +49,4 @@ entry:
   %cstoreval2 = fptrunc double %storeval2 to float
   store float %cstoreval2, float addrspace(1)* %arrayidx24, align 4
   ret void
-}
-\ No newline at end of file
+}
diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/extended-index.ll b/test/Transforms/LoadStoreVectorizer/AMDGPU/extended-index.ll
index 8a75b8743fa5..b8e95a6793e8 100644
--- a/test/Transforms/LoadStoreVectorizer/AMDGPU/extended-index.ll
+++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/extended-index.ll
@@ -1,6 +1,7 @@
 ; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -S -o - %s | FileCheck %s
 
-target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
 
 declare i32 @llvm.amdgcn.workitem.id.x() #1
 
diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/gep-bitcast.ll b/test/Transforms/LoadStoreVectorizer/AMDGPU/gep-bitcast.ll
index 4de8b0fd7c6b..5bb6289ff19e 100644
--- a/test/Transforms/LoadStoreVectorizer/AMDGPU/gep-bitcast.ll
+++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/gep-bitcast.ll
@@ -1,4 +1,7 @@
 ; RUN: opt -S -mtriple=amdgcn--amdhsa -load-store-vectorizer < %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn--amdhsa -passes='function(load-store-vectorizer)' < %s | FileCheck %s
+
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
 
 ; Check that vectorizer can find a GEP through bitcast
 ; CHECK-LABEL: @vect_zext_bitcast_f32_to_i32_idx
diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll b/test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll
index 6182c09abcfe..35836f80456d 100644
--- a/test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll
+++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll
@@ -1,6 +1,7 @@
 ; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -S -o - %s | FileCheck %s
 
-target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
 
 ; Check position of the inserted vector load/store.  Vectorized loads should be
 ; inserted at the position of the first load in the chain, and stores should be
diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/interleaved-mayalias-store.ll b/test/Transforms/LoadStoreVectorizer/AMDGPU/interleaved-mayalias-store.ll
index 3f6d7ee7dcac..81ebb712e335 100644
--- a/test/Transforms/LoadStoreVectorizer/AMDGPU/interleaved-mayalias-store.ll
+++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/interleaved-mayalias-store.ll
@@ -1,6 +1,7 @@
 ; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -S -o - %s | FileCheck %s
 
-target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
 
 ; This is NOT OK to vectorize, as either load may alias either store.
 
diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/invariant-load.ll b/test/Transforms/LoadStoreVectorizer/AMDGPU/invariant-load.ll
index ffec12acbe0d..15c47716aafc 100644
--- a/test/Transforms/LoadStoreVectorizer/AMDGPU/invariant-load.ll
+++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/invariant-load.ll
@@ -1,6 +1,7 @@
 ; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -S -o - %s | FileCheck %s
 
-target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
 
 ; CHECK-LABEL: @interleave
 ; CHECK: load <2 x double>, <2 x double> addrspace(1)* %{{.}}, align 8{{$}}
diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll b/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll
index 43352783d101..4292cbcec850 100644
--- a/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll
+++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll
@@ -5,15 +5,16 @@
 ; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-8,+unaligned-scratch-access  -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT8,ELT8-UNALIGNED,UNALIGNED,ALL %s
 ; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-16,+unaligned-scratch-access -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT16,ELT16-UNALIGNED,UNALIGNED,ALL %s
 
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
 
 ; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i32
-; ALIGNED: store i32
-; ALIGNED: store i32
-; ALIGNED: store i32
-; ALIGNED: store i32
+; ELT4-ALIGNED: store i32
+; ELT4-ALIGNED: store i32
+; ELT4-ALIGNED: store i32
+; ELT4-ALIGNED: store i32
 
-; ELT8-UNALIGNED: store <2 x i32>
-; ELT8-UNALIGNED: store <2 x i32>
+; ELT8: store <2 x i32>
+; ELT8: store <2 x i32>
 
 ; ELT16-UNALIGNED: store <4 x i32>
 define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i32(i32 addrspace(5)* %out) #0 {
@@ -166,18 +167,10 @@ define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v2i16_align8(
 ; ELT4: store i32
 ; ELT4: store i32
 
-; ELT8-ALIGNED: store i32
-; ELT8-ALIGNED: store i32
-; ELT8-ALIGNED: store i32
+; ELT8: store <2 x i32>
+; ELT8: store i32
 
-; ELT8-UNALIGNED: store <2 x i32>
-; ELT8-UNALIGNED: store i32
-
-; ELT16-ALIGNED: store i32
-; ELT16-ALIGNED: store i32
-; ELT16-ALIGNED: store i32
-
-; ELT16-UNALIGNED: store <3 x i32>
+; ELT16: store <3 x i32>
 define amdgpu_kernel void @merge_private_store_3_vector_elts_loads_v4i32(i32 addrspace(5)* %out) #0 {
   %out.gep.1 = getelementptr i32, i32 addrspace(5)* %out, i32 1
   %out.gep.2 = getelementptr i32, i32 addrspace(5)* %out, i32 2
diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll b/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll
index 5eb3b25c1dc4..0d9a4184e718 100644
--- a/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll
+++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll
@@ -1,7 +1,7 @@
 ; RUN: opt -mtriple=amdgcn-amd-amdhsa -load-store-vectorizer -S -o - %s | FileCheck %s
 ; Copy of test/CodeGen/AMDGPU/merge-stores.ll with some additions
 
-target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
 
 ; TODO: Vector element tests
 ; TODO: Non-zero base offset for load and store combinations
diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll b/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll
index 226147df66a6..bcf2265f3100 100644
--- a/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll
+++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll
@@ -1,6 +1,6 @@
 ; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s
 
-target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
 
 ; CHECK-LABEL: @merge_v2i32_v2i32(
 ; CHECK: load <4 x i32>
diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/missing-alignment.ll b/test/Transforms/LoadStoreVectorizer/AMDGPU/missing-alignment.ll
index f353106607d6..ff718c1b101e 100644
--- a/test/Transforms/LoadStoreVectorizer/AMDGPU/missing-alignment.ll
+++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/missing-alignment.ll
@@ -1,5 +1,7 @@
 ; RUN: opt -mtriple=amdgcn-- -load-store-vectorizer -S -o - %s | FileCheck %s
 
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
+
 @lds = internal addrspace(3) global [512 x float] undef, align 4
 
 ; The original load has an implicit alignment of 4, and should not
diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll b/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll
index b684ca8c12ce..ffd651b2c65b 100644
--- a/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll
+++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll
@@ -1,6 +1,6 @@
 ; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s
 
-target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
 
 ; Checks that there is no crash when there are multiple tails
 ; for a the same head starting a chain.
diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/no-implicit-float.ll b/test/Transforms/LoadStoreVectorizer/AMDGPU/no-implicit-float.ll
index 818189565b4c..86f6b6d55ec3 100644
--- a/test/Transforms/LoadStoreVectorizer/AMDGPU/no-implicit-float.ll
+++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/no-implicit-float.ll
@@ -1,5 +1,7 @@
 ; RUN: opt -mtriple=amdgcn-amd-amdhsa -load-store-vectorizer -S -o - %s | FileCheck %s
 
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
+
 ; CHECK-LABEL: @no_implicit_float(
 ; CHECK: store i32
 ; CHECK: store i32
diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/optnone.ll b/test/Transforms/LoadStoreVectorizer/AMDGPU/optnone.ll
index 28d29f8e8139..8a2abe50a5af 100644
--- a/test/Transforms/LoadStoreVectorizer/AMDGPU/optnone.ll
+++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/optnone.ll
@@ -1,5 +1,7 @@
 ; RUN: opt -mtriple=amdgcn-amd-amdhsa -load-store-vectorizer -S -o - %s | FileCheck %s
 
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
+
 ; CHECK-LABEL: @optnone(
 ; CHECK: store i32
 ; CHECK: store i32
diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/pointer-elements.ll b/test/Transforms/LoadStoreVectorizer/AMDGPU/pointer-elements.ll
index 65200b95d5e6..9290749bb898 100644
--- a/test/Transforms/LoadStoreVectorizer/AMDGPU/pointer-elements.ll
+++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/pointer-elements.ll
@@ -1,6 +1,6 @@
 ; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s
 
-target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
 
 declare i32 @llvm.amdgcn.workitem.id.x() #1
 
diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/selects.ll b/test/Transforms/LoadStoreVectorizer/AMDGPU/selects.ll
index 32fe5eb9ce2a..c020cc71b4a6 100644
--- a/test/Transforms/LoadStoreVectorizer/AMDGPU/selects.ll
+++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/selects.ll
@@ -1,6 +1,6 @@
 ; RUN: opt -mtriple=amdgcn-amd-amdhsa -load-store-vectorizer -dce -S -o - %s | FileCheck %s
 
-target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
 
 define void @base_case(i1 %cnd, i32 addrspace(1)* %a, i32 addrspace(1)* %b, <3 x i32> addrspace(1)* %out) {
 ; CHECK-LABEL: @base_case
diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/store_with_aliasing_load.ll b/test/Transforms/LoadStoreVectorizer/AMDGPU/store_with_aliasing_load.ll
index 63e688e63fbb..5ed7ee80ea09 100644
--- a/test/Transforms/LoadStoreVectorizer/AMDGPU/store_with_aliasing_load.ll
+++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/store_with_aliasing_load.ll
@@ -1,5 +1,7 @@
 ; RUN: opt -mtriple=amdgcn-amd-amdhsa -load-store-vectorizer -S -o - %s | FileCheck %s
 
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
+
 ; Check that, in the presence of an aliasing load, the stores preceding the
 ; aliasing load are safe to vectorize.
 
diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/weird-type-accesses.ll b/test/Transforms/LoadStoreVectorizer/AMDGPU/weird-type-accesses.ll
index 412d2013f6b6..65d114478b4b 100644
--- a/test/Transforms/LoadStoreVectorizer/AMDGPU/weird-type-accesses.ll
+++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/weird-type-accesses.ll
@@ -1,5 +1,7 @@
 ; RUN: opt -mtriple=amdgcn-amd-amdhsa -load-store-vectorizer -S -o - %s | FileCheck %s
 
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
+
 ; Checks that we don't merge loads/stores of types smaller than one
 ; byte, or vectors with elements smaller than one byte.
 
diff --git a/test/Transforms/LoadStoreVectorizer/X86/codegenprepare-produced-address-math.ll b/test/Transforms/LoadStoreVectorizer/X86/codegenprepare-produced-address-math.ll
index a9b72294d904..e29f3dfa537f 100644
--- a/test/Transforms/LoadStoreVectorizer/X86/codegenprepare-produced-address-math.ll
+++ b/test/Transforms/LoadStoreVectorizer/X86/codegenprepare-produced-address-math.ll
@@ -1,5 +1,7 @@
 ; RUN: opt -codegenprepare -load-store-vectorizer %s -S -o - | FileCheck %s
 ; RUN: opt                 -load-store-vectorizer %s -S -o - | FileCheck %s
+; RUN: opt -codegenprepare -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' %s -S -o - | FileCheck %s
+; RUN: opt                 -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' %s -S -o - | FileCheck %s
 
 target triple = "x86_64--"
 
diff --git a/test/Transforms/LoadStoreVectorizer/X86/compare-scev-by-complexity.ll b/test/Transforms/LoadStoreVectorizer/X86/compare-scev-by-complexity.ll
new file mode 100644
index 000000000000..e2181f6086c5
--- /dev/null
+++ b/test/Transforms/LoadStoreVectorizer/X86/compare-scev-by-complexity.ll
@@ -0,0 +1,77 @@
+; RUN: opt -load-store-vectorizer %s -S | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' %s -S | FileCheck %s
+
+; Check that setting wrapping flags after a SCEV node is created
+; does not invalidate the "sorted by complexity" invariant for the
+; operands of commutative and associative SCEV operators.
+
+target triple = "x86_64--"
+
+@global_value0 = external constant i32
+@global_value1 = external constant i32
+@other_value = external global float
+@a = external global float
+@b = external global float
+@c = external global float
+@d = external global float
+@plus1 = external global i32
+@cnd = external global i8
+
+; Function Attrs: nounwind
+define void @main() local_unnamed_addr #0 {
+; CHECK-LABEL: @main()
+; CHECK: [[PTR:%[0-9]+]] = bitcast float* %preheader.load0.address to <2 x float>*
+; CHECK:  = load <2 x float>, <2 x float>* [[PTR]]
+; CHECK-LABEL: for.body23:
+entry:
+  %tmp = load i32, i32* @global_value0, !range !0 ; !0 bounds the loaded value to [0, 65536)
+  %tmp2 = load i32, i32* @global_value1
+  %and.i.i = and i32 %tmp2, 2
+  %add.nuw.nsw.i.i = add nuw nsw i32 %and.i.i, 0
+  %mul.i.i = shl nuw nsw i32 %add.nuw.nsw.i.i, 1
+  %and6.i.i = and i32 %tmp2, 3
+  %and9.i.i = and i32 %tmp2, 4
+  %add.nuw.nsw10.i.i = add nuw nsw i32 %and6.i.i, %and9.i.i
+  %conv3.i42.i = add nuw nsw i32 %mul.i.i, 1 ; %mul.i.i + 1
+  %reass.add346.7 = add nuw nsw i32 %add.nuw.nsw10.i.i, 56
+  %reass.mul347.7 = mul nuw nsw i32 %tmp, %reass.add346.7
+  %add7.i.7 = add nuw nsw i32 %reass.mul347.7, 0
+  %preheader.address0.idx = add nuw nsw i32 %add7.i.7, %mul.i.i ; element index of the first preheader load
+  %preheader.address0.idx.zext = zext i32 %preheader.address0.idx to i64
+  %preheader.load0.address = getelementptr inbounds float, float* @other_value, i64 %preheader.address0.idx.zext
+  %preheader.load0. = load float, float* %preheader.load0.address, align 4, !tbaa !1 ; the checks expect a <2 x float> load through this address before for.body23
+  %common.address.idx = add nuw nsw i32 %add7.i.7, %conv3.i42.i ; same base plus %conv3.i42.i, i.e. one element past %preheader.address0.idx
+  %preheader.header.common.address.idx.zext = zext i32 %common.address.idx to i64
+  %preheader.load1.address = getelementptr inbounds float, float* @other_value, i64 %preheader.header.common.address.idx.zext
+  %preheader.load1. = load float, float* %preheader.load1.address, align 4, !tbaa !1
+  br label %for.body23
+
+for.body23:                                       ; preds = %for.body23, %entry
+  %loop.header.load0.address = getelementptr inbounds float, float* @other_value, i64 %preheader.header.common.address.idx.zext ; reuses the index of %preheader.load1.address
+  %loop.header.load0. = load float, float* %loop.header.load0.address, align 4, !tbaa !1
+  %reass.mul343.7 = mul nuw nsw i32 %reass.add346.7, 72
+  %add7.i286.7.7 = add nuw nsw i32 %reass.mul343.7, 56
+  %add9.i288.7.7 = add nuw nsw i32 %add7.i286.7.7, %mul.i.i
+  %loop.header.address1.idx = add nuw nsw i32 %add9.i288.7.7, 1
+  %loop.header.address1.idx.zext = zext i32 %loop.header.address1.idx to i64
+  %loop.header.load1.address = getelementptr inbounds float, float* @other_value, i64 %loop.header.address1.idx.zext
+  %loop.header.load1. = load float, float* %loop.header.load1.address, align 4, !tbaa !1
+  store float %preheader.load0., float* @a, align 4, !tbaa !1 ; each of the four loaded values is stored to its own global
+  store float %preheader.load1., float* @b, align 4, !tbaa !1
+  store float %loop.header.load0., float* @c, align 4, !tbaa !1
+  store float %loop.header.load1., float* @d, align 4, !tbaa !1
+  %loaded.cnd = load i8, i8* @cnd ; back-edge condition is re-read from @cnd on every iteration
+  %condition = trunc i8 %loaded.cnd to i1
+  br i1 %condition, label %for.body23, label %exit
+
+exit:
+  ret void
+}
+
+attributes #0 = { nounwind }
+
+!0 = !{i32 0, i32 65536}
+!1 = !{!2, !2, i64 0}
+!2 = !{!"float", !3, i64 0}
+!3 = !{!"omnipotent char", !4, i64 0}
+!4 = !{!"Simple C++ TBAA"}
diff --git a/test/Transforms/LoadStoreVectorizer/X86/correct-order.ll b/test/Transforms/LoadStoreVectorizer/X86/correct-order.ll
index fd2ae51fc1f0..043d6ea7e920 100644
--- a/test/Transforms/LoadStoreVectorizer/X86/correct-order.ll
+++ b/test/Transforms/LoadStoreVectorizer/X86/correct-order.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -mtriple=x86_64-unknown-linux-gnu -load-store-vectorizer -S -o - %s | FileCheck %s
+; RUN: opt -mtriple=x86_64-unknown-linux-gnu -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -S -o - %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
 
diff --git a/test/Transforms/LoadStoreVectorizer/X86/load-width.ll b/test/Transforms/LoadStoreVectorizer/X86/load-width.ll
index a61b25119a14..ac5f3ea9f0f8 100644
--- a/test/Transforms/LoadStoreVectorizer/X86/load-width.ll
+++ b/test/Transforms/LoadStoreVectorizer/X86/load-width.ll
@@ -1,5 +1,7 @@
 ; RUN: opt -mtriple=x86_64-unknown-linux-gnu -load-store-vectorizer -mcpu haswell -S -o - %s | FileCheck --check-prefix=CHECK-HSW %s
 ; RUN: opt -mtriple=x86_64-unknown-linux-gnu -load-store-vectorizer -mcpu knl -S -o - %s | FileCheck --check-prefix=CHECK-KNL %s
+; RUN: opt -mtriple=x86_64-unknown-linux-gnu -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -mcpu haswell -S -o - %s | FileCheck --check-prefix=CHECK-HSW %s
+; RUN: opt -mtriple=x86_64-unknown-linux-gnu -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -mcpu knl -S -o - %s | FileCheck --check-prefix=CHECK-KNL %s
 
 define <8 x double> @loadwidth_insert_extract(double* %ptr) {
     %a = bitcast double* %ptr to <2 x double> *
diff --git a/test/Transforms/LoadStoreVectorizer/X86/merge-tbaa.ll b/test/Transforms/LoadStoreVectorizer/X86/merge-tbaa.ll
index b4493a8ab96c..a93e9aceb733 100644
--- a/test/Transforms/LoadStoreVectorizer/X86/merge-tbaa.ll
+++ b/test/Transforms/LoadStoreVectorizer/X86/merge-tbaa.ll
@@ -1,5 +1,7 @@
 ; RUN: opt -mtriple=x86_64-unknown-linux-gnu -load-store-vectorizer -S < %s | \
 ; RUN:     FileCheck %s
+; RUN: opt -mtriple=x86_64-unknown-linux-gnu -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -S < %s | \
+; RUN:     FileCheck %s
 ;
 ; The GPU Load & Store Vectorizer may merge differently-typed accesses into a
 ; single instruction. This test checks that we merge TBAA tags for such
diff --git a/test/Transforms/LoadStoreVectorizer/X86/non-byte-size.ll b/test/Transforms/LoadStoreVectorizer/X86/non-byte-size.ll
index 1f00f980eac1..7a0073808a01 100644
--- a/test/Transforms/LoadStoreVectorizer/X86/non-byte-size.ll
+++ b/test/Transforms/LoadStoreVectorizer/X86/non-byte-size.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -load-store-vectorizer -S -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
+; RUN: opt < %s -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -S -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
 
 %rec = type { i32, i28 }
 
diff --git a/test/Transforms/LoadStoreVectorizer/X86/preserve-order32.ll b/test/Transforms/LoadStoreVectorizer/X86/preserve-order32.ll
index 12d882a51fa2..92d05f76fc61 100644
--- a/test/Transforms/LoadStoreVectorizer/X86/preserve-order32.ll
+++ b/test/Transforms/LoadStoreVectorizer/X86/preserve-order32.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -mtriple=x86-linux -load-store-vectorizer -S -o - %s | FileCheck %s
+; RUN: opt -mtriple=x86-linux -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -S -o - %s | FileCheck %s
 
 target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
 
diff --git a/test/Transforms/LoadStoreVectorizer/X86/preserve-order64.ll b/test/Transforms/LoadStoreVectorizer/X86/preserve-order64.ll
index bf75ecf62955..3ae0d891dc54 100644
--- a/test/Transforms/LoadStoreVectorizer/X86/preserve-order64.ll
+++ b/test/Transforms/LoadStoreVectorizer/X86/preserve-order64.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -mtriple=x86_64-unknown-linux-gnu -load-store-vectorizer -S -o - %s | FileCheck %s
+; RUN: opt -mtriple=x86_64-unknown-linux-gnu -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -S -o - %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
 
diff --git a/test/Transforms/LoadStoreVectorizer/X86/subchain-interleaved.ll b/test/Transforms/LoadStoreVectorizer/X86/subchain-interleaved.ll
index 915b94ac1557..72b29912d813 100644
--- a/test/Transforms/LoadStoreVectorizer/X86/subchain-interleaved.ll
+++ b/test/Transforms/LoadStoreVectorizer/X86/subchain-interleaved.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -mtriple=x86_64-unknown-linux-gnu -load-store-vectorizer -S -o - %s | FileCheck %s
+; RUN: opt -mtriple=x86_64-unknown-linux-gnu -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -S -o - %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
 
diff --git a/test/Transforms/LoadStoreVectorizer/X86/vector-scalar.ll b/test/Transforms/LoadStoreVectorizer/X86/vector-scalar.ll
index 379b2353dc3d..00971f350388 100644
--- a/test/Transforms/LoadStoreVectorizer/X86/vector-scalar.ll
+++ b/test/Transforms/LoadStoreVectorizer/X86/vector-scalar.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -mtriple=x86_64-unknown-linux-gnu -load-store-vectorizer -mcpu haswell -S -o - %s | FileCheck %s
+; RUN: opt -mtriple=x86_64-unknown-linux-gnu -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -mcpu haswell -S -o - %s | FileCheck %s
 
 ; Check that the LoadStoreVectorizer does not crash due to not differentiating <1 x T> and T.
 
diff --git a/test/Transforms/LoadStoreVectorizer/int_sideeffect.ll b/test/Transforms/LoadStoreVectorizer/int_sideeffect.ll
index 07bdc9123f9a..07487b578039 100644
--- a/test/Transforms/LoadStoreVectorizer/int_sideeffect.ll
+++ b/test/Transforms/LoadStoreVectorizer/int_sideeffect.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -S < %s -load-store-vectorizer | FileCheck %s
+; RUN: opt -S < %s -passes='function(load-store-vectorizer)' | FileCheck %s
 
 declare void @llvm.sideeffect()