diff options
Diffstat (limited to 'test/Transforms/LoadStoreVectorizer')
30 files changed, 217 insertions, 35 deletions
diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/aa-metadata.ll b/test/Transforms/LoadStoreVectorizer/AMDGPU/aa-metadata.ll index 4b2dab47a20f..d2834be18b0b 100644 --- a/test/Transforms/LoadStoreVectorizer/AMDGPU/aa-metadata.ll +++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/aa-metadata.ll @@ -1,7 +1,7 @@ ; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -scoped-noalias -load-store-vectorizer -S -o - %s | FileCheck -check-prefix=SCOPE -check-prefix=ALL %s ; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck -check-prefix=NOSCOPE -check-prefix=ALL %s -target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64" +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" ; This fails to vectorize if the !alias.scope is not used diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll b/test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll index 87acb1057afb..b0dd5d185c77 100644 --- a/test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll +++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll @@ -1,7 +1,10 @@ -; RUN: opt -data-layout=A5 -S -load-store-vectorizer -mattr=-unaligned-buffer-access,+max-private-element-size-16 < %s | FileCheck -check-prefix=ALIGNED -check-prefix=ALL %s -; RUN: opt -data-layout=A5 -S -load-store-vectorizer -mattr=+unaligned-buffer-access,+unaligned-scratch-access,+max-private-element-size-16 < %s | FileCheck -check-prefix=UNALIGNED -check-prefix=ALL %s +; RUN: opt -S -load-store-vectorizer -mattr=-unaligned-buffer-access,+max-private-element-size-16 < %s | FileCheck -check-prefix=ALIGNED -check-prefix=ALL %s +; RUN: opt -S -load-store-vectorizer -mattr=+unaligned-buffer-access,+unaligned-scratch-access,+max-private-element-size-16 < %s | FileCheck -check-prefix=UNALIGNED -check-prefix=ALL %s +; RUN: opt -S -passes='function(load-store-vectorizer)' -mattr=-unaligned-buffer-access,+max-private-element-size-16 < %s | FileCheck -check-prefix=ALIGNED -check-prefix=ALL %s +; RUN: opt -S -passes='function(load-store-vectorizer)' -mattr=+unaligned-buffer-access,+unaligned-scratch-access,+max-private-element-size-16 < %s | FileCheck -check-prefix=UNALIGNED -check-prefix=ALL %s target triple = "amdgcn--" +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" ; ALL-LABEL: @load_unknown_offset_align1_i8( ; ALL: alloca [128 x i8], align 1 @@ -63,10 +66,7 @@ define amdgpu_kernel void @load_unknown_offset_align1_i32(i32 addrspace(1)* noal ; ALL: alloca [128 x i32], align 16 ; UNALIGNED: load <2 x i32>, <2 x i32> addrspace(5)* %{{[0-9]+}}, align 1{{$}} - -; FIXME: Should change alignment -; ALIGNED: load i32 -; ALIGNED: load i32 +; ALIGNED: load <2 x i32>, <2 x i32> addrspace(5)* %{{[0-9]+}}, align 4{{$}} define amdgpu_kernel void @load_alloca16_unknown_offset_align1_i32(i32 addrspace(1)* noalias %out, i32 %offset) #0 { %alloca = alloca [128 x i32], align 16, addrspace(5) %ptr0 = getelementptr inbounds [128 x i32], [128 x i32] addrspace(5)* %alloca, i32 0, i32 %offset @@ -127,5 +127,84 @@ define amdgpu_kernel void @store_unknown_offset_align1_i32(i32 addrspace(1)* noa ret void } -attributes #0 = { nounwind } +; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i32( +; ALIGNED: %alloca = alloca [8 x i32], align 4, addrspace(5) +; ALIGNED: store <4 x i32> <i32 9, i32 1, i32 23, i32 19>, <4 x i32> addrspace(5)* %1, align 4 + +; UNALIGNED: %alloca = alloca [8 x i32], align 1, addrspace(5) +; UNALIGNED: store <4 x i32> <i32 9, i32 1, i32 23, i32 19>, <4 x i32> addrspace(5)* %1, align 1 +define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i32() { + %alloca = alloca [8 x i32], align 1, addrspace(5) + %out = bitcast [8 x i32] addrspace(5)* %alloca to i32 addrspace(5)* + %out.gep.1 = getelementptr i32, i32 addrspace(5)* %out, i32 1 + %out.gep.2 = getelementptr i32, i32 addrspace(5)* %out, i32 2 + %out.gep.3 = getelementptr i32, i32 addrspace(5)* %out, i32 3 + + store i32 9, i32 addrspace(5)* %out, align 1 + store i32 1, i32 addrspace(5)* %out.gep.1, align 1 + store i32 23, i32 addrspace(5)* %out.gep.2, align 1 + store i32 19, i32 addrspace(5)* %out.gep.3, align 1 + ret void +} + +; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i8( +; ALIGNED: %alloca = alloca [8 x i8], align 4, addrspace(5) +; ALIGNED: store <4 x i8> <i8 9, i8 1, i8 23, i8 19>, <4 x i8> addrspace(5)* %1, align 4 + +; UNALIGNED: %alloca = alloca [8 x i8], align 1, addrspace(5) +; UNALIGNED: store <4 x i8> <i8 9, i8 1, i8 23, i8 19>, <4 x i8> addrspace(5)* %1, align 1 +define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i8() { + %alloca = alloca [8 x i8], align 1, addrspace(5) + %out = bitcast [8 x i8] addrspace(5)* %alloca to i8 addrspace(5)* + %out.gep.1 = getelementptr i8, i8 addrspace(5)* %out, i8 1 + %out.gep.2 = getelementptr i8, i8 addrspace(5)* %out, i8 2 + %out.gep.3 = getelementptr i8, i8 addrspace(5)* %out, i8 3 + + store i8 9, i8 addrspace(5)* %out, align 1 + store i8 1, i8 addrspace(5)* %out.gep.1, align 1 + store i8 23, i8 addrspace(5)* %out.gep.2, align 1 + store i8 19, i8 addrspace(5)* %out.gep.3, align 1 + ret void +} +; ALL-LABEL: @merge_private_load_4_vector_elts_loads_v4i32( +; ALIGNED: %alloca = alloca [8 x i32], align 4, addrspace(5) +; ALIGNED: load <4 x i32>, <4 x i32> addrspace(5)* %1, align 4 + +; UNALIGNED: %alloca = alloca [8 x i32], align 1, addrspace(5) +; UNALIGNED: load <4 x i32>, <4 x i32> addrspace(5)* %1, align 1 +define amdgpu_kernel void @merge_private_load_4_vector_elts_loads_v4i32() { + %alloca = alloca [8 x i32], align 1, addrspace(5) + %out = bitcast [8 x i32] addrspace(5)* %alloca to i32 addrspace(5)* + %out.gep.1 = getelementptr i32, i32 addrspace(5)* %out, i32 1 + %out.gep.2 = getelementptr i32, i32 addrspace(5)* %out, i32 2 + %out.gep.3 = getelementptr i32, i32 addrspace(5)* %out, i32 3 + + %load0 = load i32, i32 addrspace(5)* %out, align 1 + %load1 = load i32, i32 addrspace(5)* %out.gep.1, align 1 + %load2 = load i32, i32 addrspace(5)* %out.gep.2, align 1 + %load3 = load i32, i32 addrspace(5)* %out.gep.3, align 1 + ret void +} + +; ALL-LABEL: @merge_private_load_4_vector_elts_loads_v4i8( +; ALIGNED: %alloca = alloca [8 x i8], align 4, addrspace(5) +; ALIGNED: load <4 x i8>, <4 x i8> addrspace(5)* %1, align 4 + +; UNALIGNED: %alloca = alloca [8 x i8], align 1, addrspace(5) +; UNALIGNED: load <4 x i8>, <4 x i8> addrspace(5)* %1, align 1 +define amdgpu_kernel void @merge_private_load_4_vector_elts_loads_v4i8() { + %alloca = alloca [8 x i8], align 1, addrspace(5) + %out = bitcast [8 x i8] addrspace(5)* %alloca to i8 addrspace(5)* + %out.gep.1 = getelementptr i8, i8 addrspace(5)* %out, i8 1 + %out.gep.2 = getelementptr i8, i8 addrspace(5)* %out, i8 2 + %out.gep.3 = getelementptr i8, i8 addrspace(5)* %out, i8 3 + + %load0 = load i8, i8 addrspace(5)* %out, align 1 + %load1 = load i8, i8 addrspace(5)* %out.gep.1, align 1 + %load2 = load i8, i8 addrspace(5)* %out.gep.2, align 1 + %load3 = load i8, i8 addrspace(5)* %out.gep.3, align 1 + ret void +} + +attributes #0 = { nounwind } diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/complex-index.ll b/test/Transforms/LoadStoreVectorizer/AMDGPU/complex-index.ll index 220efd21fe19..cd1c7fdc521b 100644 --- a/test/Transforms/LoadStoreVectorizer/AMDGPU/complex-index.ll +++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/complex-index.ll @@ -1,4 +1,7 @@ ; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -S -o - %s | FileCheck %s + +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" declare i64 @_Z12get_local_idj(i32) @@ -46,4 +49,4 @@ entry: %cstoreval2 = fptrunc double %storeval2 to float store float %cstoreval2, float addrspace(1)* %arrayidx24, align 4 ret void -}
\ No newline at end of file +} diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/extended-index.ll b/test/Transforms/LoadStoreVectorizer/AMDGPU/extended-index.ll index 8a75b8743fa5..b8e95a6793e8 100644 --- a/test/Transforms/LoadStoreVectorizer/AMDGPU/extended-index.ll +++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/extended-index.ll @@ -1,6 +1,7 @@ ; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -S -o - %s | FileCheck %s -target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64" +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" declare i32 @llvm.amdgcn.workitem.id.x() #1 diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/gep-bitcast.ll b/test/Transforms/LoadStoreVectorizer/AMDGPU/gep-bitcast.ll index 4de8b0fd7c6b..5bb6289ff19e 100644 --- a/test/Transforms/LoadStoreVectorizer/AMDGPU/gep-bitcast.ll +++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/gep-bitcast.ll @@ -1,4 +1,7 @@ ; RUN: opt -S -mtriple=amdgcn--amdhsa -load-store-vectorizer < %s | FileCheck %s +; RUN: opt -S -mtriple=amdgcn--amdhsa -passes='function(load-store-vectorizer)' < %s | FileCheck %s + +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" ; Check that vectorizer can find a GEP through bitcast ; CHECK-LABEL: @vect_zext_bitcast_f32_to_i32_idx diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll b/test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll index 6182c09abcfe..35836f80456d 100644 --- a/test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll +++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll @@ -1,6 +1,7 @@ ; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -S -o - %s | FileCheck %s -target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64" +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" ; Check position of the inserted vector load/store. Vectorized loads should be ; inserted at the position of the first load in the chain, and stores should be diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/interleaved-mayalias-store.ll b/test/Transforms/LoadStoreVectorizer/AMDGPU/interleaved-mayalias-store.ll index 3f6d7ee7dcac..81ebb712e335 100644 --- a/test/Transforms/LoadStoreVectorizer/AMDGPU/interleaved-mayalias-store.ll +++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/interleaved-mayalias-store.ll @@ -1,6 +1,7 @@ ; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -S -o - %s | FileCheck %s -target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64" +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" ; This is NOT OK to vectorize, as either load may alias either store. diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/invariant-load.ll b/test/Transforms/LoadStoreVectorizer/AMDGPU/invariant-load.ll index ffec12acbe0d..15c47716aafc 100644 --- a/test/Transforms/LoadStoreVectorizer/AMDGPU/invariant-load.ll +++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/invariant-load.ll @@ -1,6 +1,7 @@ ; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -S -o - %s | FileCheck %s -target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64" +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" ; CHECK-LABEL: @interleave ; CHECK: load <2 x double>, <2 x double> addrspace(1)* %{{.}}, align 8{{$}} diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll b/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll index 43352783d101..4292cbcec850 100644 --- a/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll +++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll @@ -5,15 +5,16 @@ ; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-8,+unaligned-scratch-access -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT8,ELT8-UNALIGNED,UNALIGNED,ALL %s ; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-16,+unaligned-scratch-access -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT16,ELT16-UNALIGNED,UNALIGNED,ALL %s +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" ; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i32 -; ALIGNED: store i32 -; ALIGNED: store i32 -; ALIGNED: store i32 -; ALIGNED: store i32 +; ELT4-ALIGNED: store i32 +; ELT4-ALIGNED: store i32 +; ELT4-ALIGNED: store i32 +; ELT4-ALIGNED: store i32 -; ELT8-UNALIGNED: store <2 x i32> -; ELT8-UNALIGNED: store <2 x i32> +; ELT8: store <2 x i32> +; ELT8: store <2 x i32> ; ELT16-UNALIGNED: store <4 x i32> define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i32(i32 addrspace(5)* %out) #0 { @@ -166,18 +167,10 @@ define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v2i16_align8( ; ELT4: store i32 ; ELT4: store i32 -; ELT8-ALIGNED: store i32 -; ELT8-ALIGNED: store i32 -; ELT8-ALIGNED: store i32 +; ELT8: store <2 x i32> +; ELT8: store i32 -; ELT8-UNALIGNED: store <2 x i32> -; ELT8-UNALIGNED: store i32 - -; ELT16-ALIGNED: store i32 -; ELT16-ALIGNED: store i32 -; ELT16-ALIGNED: store i32 - -; ELT16-UNALIGNED: store <3 x i32> +; ELT16: store <3 x i32> define amdgpu_kernel void @merge_private_store_3_vector_elts_loads_v4i32(i32 addrspace(5)* %out) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(5)* %out, i32 1 %out.gep.2 = getelementptr i32, i32 addrspace(5)* %out, i32 2 diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll b/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll index 5eb3b25c1dc4..0d9a4184e718 100644 --- a/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll +++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll @@ -1,7 +1,7 @@ ; RUN: opt -mtriple=amdgcn-amd-amdhsa -load-store-vectorizer -S -o - %s | FileCheck %s ; Copy of test/CodeGen/AMDGPU/merge-stores.ll with some additions -target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64" +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" ; TODO: Vector element tests ; TODO: Non-zero base offset for load and store combinations diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll b/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll index 226147df66a6..bcf2265f3100 100644 --- a/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll +++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll @@ -1,6 +1,6 @@ ; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s -target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64" +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" ; CHECK-LABEL: @merge_v2i32_v2i32( ; CHECK: load <4 x i32> diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/missing-alignment.ll b/test/Transforms/LoadStoreVectorizer/AMDGPU/missing-alignment.ll index f353106607d6..ff718c1b101e 100644 --- a/test/Transforms/LoadStoreVectorizer/AMDGPU/missing-alignment.ll +++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/missing-alignment.ll @@ -1,5 +1,7 @@ ; RUN: opt -mtriple=amdgcn-- -load-store-vectorizer -S -o - %s | FileCheck %s +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" + @lds = internal addrspace(3) global [512 x float] undef, align 4 ; The original load has an implicit alignment of 4, and should not diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll b/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll index b684ca8c12ce..ffd651b2c65b 100644 --- a/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll +++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll @@ -1,6 +1,6 @@ ; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s -target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64" +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" ; Checks that there is no crash when there are multiple tails ; for a the same head starting a chain. diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/no-implicit-float.ll b/test/Transforms/LoadStoreVectorizer/AMDGPU/no-implicit-float.ll index 818189565b4c..86f6b6d55ec3 100644 --- a/test/Transforms/LoadStoreVectorizer/AMDGPU/no-implicit-float.ll +++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/no-implicit-float.ll @@ -1,5 +1,7 @@ ; RUN: opt -mtriple=amdgcn-amd-amdhsa -load-store-vectorizer -S -o - %s | FileCheck %s +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" + ; CHECK-LABEL: @no_implicit_float( ; CHECK: store i32 ; CHECK: store i32 diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/optnone.ll b/test/Transforms/LoadStoreVectorizer/AMDGPU/optnone.ll index 28d29f8e8139..8a2abe50a5af 100644 --- a/test/Transforms/LoadStoreVectorizer/AMDGPU/optnone.ll +++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/optnone.ll @@ -1,5 +1,7 @@ ; RUN: opt -mtriple=amdgcn-amd-amdhsa -load-store-vectorizer -S -o - %s | FileCheck %s +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" + ; CHECK-LABEL: @optnone( ; CHECK: store i32 ; CHECK: store i32 diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/pointer-elements.ll b/test/Transforms/LoadStoreVectorizer/AMDGPU/pointer-elements.ll index 65200b95d5e6..9290749bb898 100644 --- a/test/Transforms/LoadStoreVectorizer/AMDGPU/pointer-elements.ll +++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/pointer-elements.ll @@ -1,6 +1,6 @@ ; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s -target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64" +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" declare i32 @llvm.amdgcn.workitem.id.x() #1 diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/selects.ll b/test/Transforms/LoadStoreVectorizer/AMDGPU/selects.ll index 32fe5eb9ce2a..c020cc71b4a6 100644 --- a/test/Transforms/LoadStoreVectorizer/AMDGPU/selects.ll +++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/selects.ll @@ -1,6 +1,6 @@ ; RUN: opt -mtriple=amdgcn-amd-amdhsa -load-store-vectorizer -dce -S -o - %s | FileCheck %s -target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64" +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" define void @base_case(i1 %cnd, i32 addrspace(1)* %a, i32 addrspace(1)* %b, <3 x i32> addrspace(1)* %out) { ; CHECK-LABEL: @base_case diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/store_with_aliasing_load.ll b/test/Transforms/LoadStoreVectorizer/AMDGPU/store_with_aliasing_load.ll index 63e688e63fbb..5ed7ee80ea09 100644 --- a/test/Transforms/LoadStoreVectorizer/AMDGPU/store_with_aliasing_load.ll +++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/store_with_aliasing_load.ll @@ -1,5 +1,7 @@ ; RUN: opt -mtriple=amdgcn-amd-amdhsa -load-store-vectorizer -S -o - %s | FileCheck %s +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" + ; Check that, in the presence of an aliasing load, the stores preceding the ; aliasing load are safe to vectorize. diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/weird-type-accesses.ll b/test/Transforms/LoadStoreVectorizer/AMDGPU/weird-type-accesses.ll index 412d2013f6b6..65d114478b4b 100644 --- a/test/Transforms/LoadStoreVectorizer/AMDGPU/weird-type-accesses.ll +++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/weird-type-accesses.ll @@ -1,5 +1,7 @@ ; RUN: opt -mtriple=amdgcn-amd-amdhsa -load-store-vectorizer -S -o - %s | FileCheck %s +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" + ; Checks that we don't merge loads/stores of types smaller than one ; byte, or vectors with elements smaller than one byte. diff --git a/test/Transforms/LoadStoreVectorizer/X86/codegenprepare-produced-address-math.ll b/test/Transforms/LoadStoreVectorizer/X86/codegenprepare-produced-address-math.ll index a9b72294d904..e29f3dfa537f 100644 --- a/test/Transforms/LoadStoreVectorizer/X86/codegenprepare-produced-address-math.ll +++ b/test/Transforms/LoadStoreVectorizer/X86/codegenprepare-produced-address-math.ll @@ -1,5 +1,7 @@ ; RUN: opt -codegenprepare -load-store-vectorizer %s -S -o - | FileCheck %s ; RUN: opt -load-store-vectorizer %s -S -o - | FileCheck %s +; RUN: opt -codegenprepare -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' %s -S -o - | FileCheck %s +; RUN: opt -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' %s -S -o - | FileCheck %s target triple = "x86_64--" diff --git a/test/Transforms/LoadStoreVectorizer/X86/compare-scev-by-complexity.ll b/test/Transforms/LoadStoreVectorizer/X86/compare-scev-by-complexity.ll new file mode 100644 index 000000000000..e2181f6086c5 --- /dev/null +++ b/test/Transforms/LoadStoreVectorizer/X86/compare-scev-by-complexity.ll @@ -0,0 +1,77 @@ +; RUN: opt -load-store-vectorizer %s -S | FileCheck %s +; RUN: opt -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' %s -S | FileCheck %s + +; Check that setting wrapping flags after a SCEV node is created +; does not invalidate "sorted by complexity" invariant for +; operands of commutative and associative SCEV operators. + +target triple = "x86_64--" + +@global_value0 = external constant i32 +@global_value1 = external constant i32 +@other_value = external global float +@a = external global float +@b = external global float +@c = external global float +@d = external global float +@plus1 = external global i32 +@cnd = external global i8 + +; Function Attrs: nounwind +define void @main() local_unnamed_addr #0 { +; CHECK-LABEL: @main() +; CHECK: [[PTR:%[0-9]+]] = bitcast float* %preheader.load0.address to <2 x float>* +; CHECK: = load <2 x float>, <2 x float>* [[PTR]] +; CHECK-LABEL: for.body23: +entry: + %tmp = load i32, i32* @global_value0, !range !0 + %tmp2 = load i32, i32* @global_value1 + %and.i.i = and i32 %tmp2, 2 + %add.nuw.nsw.i.i = add nuw nsw i32 %and.i.i, 0 + %mul.i.i = shl nuw nsw i32 %add.nuw.nsw.i.i, 1 + %and6.i.i = and i32 %tmp2, 3 + %and9.i.i = and i32 %tmp2, 4 + %add.nuw.nsw10.i.i = add nuw nsw i32 %and6.i.i, %and9.i.i + %conv3.i42.i = add nuw nsw i32 %mul.i.i, 1 + %reass.add346.7 = add nuw nsw i32 %add.nuw.nsw10.i.i, 56 + %reass.mul347.7 = mul nuw nsw i32 %tmp, %reass.add346.7 + %add7.i.7 = add nuw nsw i32 %reass.mul347.7, 0 + %preheader.address0.idx = add nuw nsw i32 %add7.i.7, %mul.i.i + %preheader.address0.idx.zext = zext i32 %preheader.address0.idx to i64 + %preheader.load0.address = getelementptr inbounds float, float* @other_value, i64 %preheader.address0.idx.zext + %preheader.load0. = load float, float* %preheader.load0.address, align 4, !tbaa !1 + %common.address.idx = add nuw nsw i32 %add7.i.7, %conv3.i42.i + %preheader.header.common.address.idx.zext = zext i32 %common.address.idx to i64 + %preheader.load1.address = getelementptr inbounds float, float* @other_value, i64 %preheader.header.common.address.idx.zext + %preheader.load1. = load float, float* %preheader.load1.address, align 4, !tbaa !1 + br label %for.body23 + +for.body23: ; preds = %for.body23, %entry + %loop.header.load0.address = getelementptr inbounds float, float* @other_value, i64 %preheader.header.common.address.idx.zext + %loop.header.load0. = load float, float* %loop.header.load0.address, align 4, !tbaa !1 + %reass.mul343.7 = mul nuw nsw i32 %reass.add346.7, 72 + %add7.i286.7.7 = add nuw nsw i32 %reass.mul343.7, 56 + %add9.i288.7.7 = add nuw nsw i32 %add7.i286.7.7, %mul.i.i + %loop.header.address1.idx = add nuw nsw i32 %add9.i288.7.7, 1 + %loop.header.address1.idx.zext = zext i32 %loop.header.address1.idx to i64 + %loop.header.load1.address = getelementptr inbounds float, float* @other_value, i64 %loop.header.address1.idx.zext + %loop.header.load1. = load float, float* %loop.header.load1.address, align 4, !tbaa !1 + store float %preheader.load0., float* @a, align 4, !tbaa !1 + store float %preheader.load1., float* @b, align 4, !tbaa !1 + store float %loop.header.load0., float* @c, align 4, !tbaa !1 + store float %loop.header.load1., float* @d, align 4, !tbaa !1 + %loaded.cnd = load i8, i8* @cnd + %condition = trunc i8 %loaded.cnd to i1 + br i1 %condition, label %for.body23, label %exit + +exit: + ret void +} + +attributes #0 = { nounwind } + +!0 = !{i32 0, i32 65536} +!1 = !{!2, !2, i64 0} +!2 = !{!"float", !3, i64 0} +!3 = !{!"omnipotent char", !4, i64 0} +!4 = !{!"Simple C++ TBAA"} diff --git a/test/Transforms/LoadStoreVectorizer/X86/correct-order.ll b/test/Transforms/LoadStoreVectorizer/X86/correct-order.ll index fd2ae51fc1f0..043d6ea7e920 100644 --- a/test/Transforms/LoadStoreVectorizer/X86/correct-order.ll +++ b/test/Transforms/LoadStoreVectorizer/X86/correct-order.ll @@ -1,4 +1,5 @@ ; RUN: opt -mtriple=x86_64-unknown-linux-gnu -load-store-vectorizer -S -o - %s | FileCheck %s +; RUN: opt -mtriple=x86_64-unknown-linux-gnu -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -S -o - %s | FileCheck %s target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" diff --git a/test/Transforms/LoadStoreVectorizer/X86/load-width.ll b/test/Transforms/LoadStoreVectorizer/X86/load-width.ll index a61b25119a14..ac5f3ea9f0f8 100644 --- a/test/Transforms/LoadStoreVectorizer/X86/load-width.ll +++ b/test/Transforms/LoadStoreVectorizer/X86/load-width.ll @@ -1,5 +1,7 @@ ; RUN: opt -mtriple=x86_64-unknown-linux-gnu -load-store-vectorizer -mcpu haswell -S -o - %s | FileCheck --check-prefix=CHECK-HSW %s ; RUN: opt -mtriple=x86_64-unknown-linux-gnu -load-store-vectorizer -mcpu knl -S -o - %s | FileCheck --check-prefix=CHECK-KNL %s +; RUN: opt -mtriple=x86_64-unknown-linux-gnu -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -mcpu haswell -S -o - %s | FileCheck --check-prefix=CHECK-HSW %s +; RUN: opt -mtriple=x86_64-unknown-linux-gnu -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -mcpu knl -S -o - %s | FileCheck --check-prefix=CHECK-KNL %s define <8 x double> @loadwidth_insert_extract(double* %ptr) { %a = bitcast double* %ptr to <2 x double> * diff --git a/test/Transforms/LoadStoreVectorizer/X86/merge-tbaa.ll b/test/Transforms/LoadStoreVectorizer/X86/merge-tbaa.ll index b4493a8ab96c..a93e9aceb733 100644 --- a/test/Transforms/LoadStoreVectorizer/X86/merge-tbaa.ll +++ b/test/Transforms/LoadStoreVectorizer/X86/merge-tbaa.ll @@ -1,5 +1,7 @@ ; RUN: opt -mtriple=x86_64-unknown-linux-gnu -load-store-vectorizer -S < %s | \ ; RUN: FileCheck %s +; RUN: opt -mtriple=x86_64-unknown-linux-gnu -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -S < %s | \ +; RUN: FileCheck %s ; ; The GPU Load & Store Vectorizer may merge differently-typed accesses into a ; single instruction. This test checks that we merge TBAA tags for such diff --git a/test/Transforms/LoadStoreVectorizer/X86/non-byte-size.ll b/test/Transforms/LoadStoreVectorizer/X86/non-byte-size.ll index 1f00f980eac1..7a0073808a01 100644 --- a/test/Transforms/LoadStoreVectorizer/X86/non-byte-size.ll +++ b/test/Transforms/LoadStoreVectorizer/X86/non-byte-size.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -load-store-vectorizer -S -mtriple=x86_64-unknown-linux-gnu | FileCheck %s +; RUN: opt < %s -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -S -mtriple=x86_64-unknown-linux-gnu | FileCheck %s %rec = type { i32, i28 } diff --git a/test/Transforms/LoadStoreVectorizer/X86/preserve-order32.ll b/test/Transforms/LoadStoreVectorizer/X86/preserve-order32.ll index 12d882a51fa2..92d05f76fc61 100644 --- a/test/Transforms/LoadStoreVectorizer/X86/preserve-order32.ll +++ b/test/Transforms/LoadStoreVectorizer/X86/preserve-order32.ll @@ -1,4 +1,5 @@ ; RUN: opt -mtriple=x86-linux -load-store-vectorizer -S -o - %s | FileCheck %s +; RUN: opt -mtriple=x86-linux -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -S -o - %s | FileCheck %s target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64" diff --git a/test/Transforms/LoadStoreVectorizer/X86/preserve-order64.ll b/test/Transforms/LoadStoreVectorizer/X86/preserve-order64.ll index bf75ecf62955..3ae0d891dc54 100644 --- a/test/Transforms/LoadStoreVectorizer/X86/preserve-order64.ll +++ b/test/Transforms/LoadStoreVectorizer/X86/preserve-order64.ll @@ -1,4 +1,5 @@ ; RUN: opt -mtriple=x86_64-unknown-linux-gnu -load-store-vectorizer -S -o - %s | FileCheck %s +; RUN: opt -mtriple=x86_64-unknown-linux-gnu -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -S -o - %s | FileCheck %s target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" diff --git a/test/Transforms/LoadStoreVectorizer/X86/subchain-interleaved.ll b/test/Transforms/LoadStoreVectorizer/X86/subchain-interleaved.ll index 915b94ac1557..72b29912d813 100644 --- a/test/Transforms/LoadStoreVectorizer/X86/subchain-interleaved.ll +++ b/test/Transforms/LoadStoreVectorizer/X86/subchain-interleaved.ll @@ -1,4 +1,5 @@ ; RUN: opt -mtriple=x86_64-unknown-linux-gnu -load-store-vectorizer -S -o - %s | FileCheck %s +; RUN: opt -mtriple=x86_64-unknown-linux-gnu -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -S -o - %s | FileCheck %s target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" diff --git a/test/Transforms/LoadStoreVectorizer/X86/vector-scalar.ll b/test/Transforms/LoadStoreVectorizer/X86/vector-scalar.ll index 379b2353dc3d..00971f350388 100644 --- a/test/Transforms/LoadStoreVectorizer/X86/vector-scalar.ll +++ b/test/Transforms/LoadStoreVectorizer/X86/vector-scalar.ll @@ -1,4 +1,5 @@ ; RUN: opt -mtriple=x86_64-unknown-linux-gnu -load-store-vectorizer -mcpu haswell -S -o - %s | FileCheck %s +; RUN: opt -mtriple=x86_64-unknown-linux-gnu -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -mcpu haswell -S -o - %s | FileCheck %s ; Check that the LoadStoreVectorizer does not crash due to not differentiating <1 x T> and T. diff --git a/test/Transforms/LoadStoreVectorizer/int_sideeffect.ll b/test/Transforms/LoadStoreVectorizer/int_sideeffect.ll index 07bdc9123f9a..07487b578039 100644 --- a/test/Transforms/LoadStoreVectorizer/int_sideeffect.ll +++ b/test/Transforms/LoadStoreVectorizer/int_sideeffect.ll @@ -1,4 +1,5 @@ ; RUN: opt -S < %s -load-store-vectorizer | FileCheck %s +; RUN: opt -S < %s -passes='function(load-store-vectorizer)' | FileCheck %s declare void @llvm.sideeffect() |
