Vendor import of clang trunk r304222: - src

diff options


context:
space:
mode:

author	Dimitry Andric <dim@FreeBSD.org>	2017-05-30 17:37:44 +0000
committer	Dimitry Andric <dim@FreeBSD.org>	2017-05-30 17:37:44 +0000
commit	550ae89a710bf458d47e5b1d183f5e7039c2b384 (patch)
tree	4eab680d9198cddf87acb23a14c836472b21ae89 /test/CodeGen
parent	b5aee35cc5d62f11d98539f62e4fe63f0ac9edc6 (diff)

vendor/clang/clang-trunk-r304222

Notes

Diffstat (limited to 'test/CodeGen')

-rw-r--r--	test/CodeGen/altivec-ct.c	82

-rw-r--r--	test/CodeGen/arm_neon_intrinsics.c	1239

-rw-r--r--	test/CodeGen/union-align.c	4

3 files changed, 704 insertions, 621 deletions

diff --git a/test/CodeGen/altivec-ct.c b/test/CodeGen/altivec-ct.c
new file mode 100644
index 000000000000..1a3e14dc1fd5
--- /dev/null
+++ b/test/CodeGen/altivec-ct.c

@@ -0,0 +1,82 @@

+// RUN: %clang_cc1 -triple powerpc64le-linux-gnu -S -O0 -o - %s -target-feature +altivec -target-feature +vsx | FileCheck %s -check-prefix=CHECK -check-prefix=VSX

+// RUN: %clang_cc1 -triple powerpc-linux-gnu -S -O0 -o - %s -target-feature +altivec -target-feature -vsx | FileCheck %s

+// REQUIRES: powerpc-registered-target

+#include <altivec.h>

+// CHECK-LABEL: test1

+// CHECK: vcfsx

+vector float test1(vector int x) {

+ return vec_ctf(x, 0);

+}

+// CHECK-LABEL: test2

+// CHECK: vcfux

+vector float test2(vector unsigned int x) {

+ return vec_ctf(x, 0);

+}

+#ifdef __VSX__

+// VSX-LABEL: test3

+vector double test3(vector signed long long x) {

+ return vec_ctf(x, 0);

+}

+// VSX-LABEL: test4

+vector double test4(vector unsigned long long x) {

+ return vec_ctf(x, 0);

+}

+#endif

+// CHECK-LABEL: test5

+// CHECK: vcfsx

+vector float test5(vector int x) {

+ return vec_vcfsx(x, 0);

+}

+// CHECK-LABEL: test6

+// CHECK: vcfux

+vector float test6(vector unsigned int x) {

+ return vec_vcfux(x, 0);

+}

+// CHECK-LABEL: test7

+// CHECK: vctsxs

+vector int test7(vector float x) {

+ return vec_cts(x, 0);

+}

+#ifdef __VSX__

+// VSX-LABEL: test8

+vector signed long long test8(vector double x) {

+ return vec_cts(x, 0);

+}

+#endif

+// CHECK-LABEL: test9

+// CHECK: vctsxs

+vector int test9(vector float x) {

+ return vec_vctsxs(x, 0);

+}

+// CHECK-LABEL: test10

+// CHECK: vctuxs

+vector unsigned test10(vector float x) {

+ return vec_ctu(x, 0);

+}

+#ifdef __VSX__

+// VSX-LABEL: test11

+vector unsigned long long test11(vector double x) {

+ return vec_ctu(x, 0);

+}

+#endif

+// CHECK-LABEL: test12

+// CHECK: vctuxs

+vector unsigned test12(vector float x) {

+ return vec_vctuxs(x, 0);

+}

diff --git a/test/CodeGen/arm_neon_intrinsics.c b/test/CodeGen/arm_neon_intrinsics.c
index a8b03b5d9b0b..ae7c78e08f86 100644
--- a/test/CodeGen/arm_neon_intrinsics.c
+++ b/test/CodeGen/arm_neon_intrinsics.c

@@ -1,5 +1,6 @@

// RUN: %clang_cc1 -triple thumbv7s-apple-darwin -target-abi apcs-gnu\

-// RUN: -target-cpu swift -fallow-half-arguments-and-returns -ffreestanding -emit-llvm -o - %s \

+// RUN: -target-cpu swift -fallow-half-arguments-and-returns -ffreestanding \

+// RUN: -disable-O0-optnone -emit-llvm -o - %s \

// RUN: | opt -S -mem2reg | FileCheck %s

// REQUIRES: long-tests

@@ -3480,11 +3481,11 @@ float32_t test_vgetq_lane_f32(float32x4_t a) {

}

// CHECK-LABEL: @test_vgetq_lane_f16(

-// CHECK: [[__REINT_244:%.*]] = alloca <8 x half>, align 16

+// CHECK: [[__REINT_244:%.*]] = alloca <8 x half>, align 8

// CHECK: [[__REINT1_244:%.*]] = alloca i16, align 2

-// CHECK: store <8 x half> %a, <8 x half>* [[__REINT_244]], align 16

+// CHECK: store <8 x half> %a, <8 x half>* [[__REINT_244]], align 8

// CHECK: [[TMP0:%.*]] = bitcast <8 x half>* [[__REINT_244]] to <8 x i16>*

-// CHECK: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 16

+// CHECK: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 8

// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8>

// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>

// CHECK: [[VGET_LANE:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3

@@ -4542,7 +4543,7 @@ poly16x4_t test_vld1_lane_p16(poly16_t const * a, poly16x4_t b) {

}

// CHECK-LABEL: @test_vld2q_u8(

-// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x2_t, align 16

+// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x2_t, align 8

// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*

// CHECK: [[VLD2Q_V:%.*]] = call { <16 x i8>, <16 x i8>

uint8x16x2_t test_vld2q_u8(uint8_t const * a) {

@@ -4550,7 +4551,7 @@ uint8x16x2_t test_vld2q_u8(uint8_t const * a) {

}

// CHECK-LABEL: @test_vld2q_u16(

-// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x2_t, align 16

+// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x2_t, align 8

// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*

// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*

// CHECK: [[VLD2Q_V:%.*]] = call { <8 x i16>, <8 x i16>

@@ -4559,7 +4560,7 @@ uint16x8x2_t test_vld2q_u16(uint16_t const * a) {

}

// CHECK-LABEL: @test_vld2q_u32(

-// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align 16

+// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align 8

// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*

// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*

// CHECK: [[VLD2Q_V:%.*]] = call { <4 x i32>, <4 x i32>

@@ -4568,7 +4569,7 @@ uint32x4x2_t test_vld2q_u32(uint32_t const * a) {

}

// CHECK-LABEL: @test_vld2q_s8(

-// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x2_t, align 16

+// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x2_t, align 8

// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8*

// CHECK: [[VLD2Q_V:%.*]] = call { <16 x i8>, <16 x i8>

int8x16x2_t test_vld2q_s8(int8_t const * a) {

@@ -4576,7 +4577,7 @@ int8x16x2_t test_vld2q_s8(int8_t const * a) {

}

// CHECK-LABEL: @test_vld2q_s16(

-// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x2_t, align 16

+// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x2_t, align 8

// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*

// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*

// CHECK: [[VLD2Q_V:%.*]] = call { <8 x i16>, <8 x i16>

@@ -4585,7 +4586,7 @@ int16x8x2_t test_vld2q_s16(int16_t const * a) {

}

// CHECK-LABEL: @test_vld2q_s32(

-// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x2_t, align 16

+// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x2_t, align 8

// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*

// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*

// CHECK: [[VLD2Q_V:%.*]] = call { <4 x i32>, <4 x i32>

@@ -4594,7 +4595,7 @@ int32x4x2_t test_vld2q_s32(int32_t const * a) {

}

// CHECK-LABEL: @test_vld2q_f16(

-// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x2_t, align 16

+// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x2_t, align 8

// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*

// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8*

// CHECK: [[VLD2Q_V:%.*]] = call { <8 x i16>, <8 x i16>

@@ -4603,7 +4604,7 @@ float16x8x2_t test_vld2q_f16(float16_t const * a) {

}

// CHECK-LABEL: @test_vld2q_f32(

-// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x2_t, align 16

+// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x2_t, align 8

// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*

// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8*

// CHECK: [[VLD2Q_V:%.*]] = call { <4 x float>, <4 x float>

@@ -4612,7 +4613,7 @@ float32x4x2_t test_vld2q_f32(float32_t const * a) {

}

// CHECK-LABEL: @test_vld2q_p8(

-// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x2_t, align 16

+// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x2_t, align 8

// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8*

// CHECK: [[VLD2Q_V:%.*]] = call { <16 x i8>, <16 x i8>

poly8x16x2_t test_vld2q_p8(poly8_t const * a) {

@@ -4620,7 +4621,7 @@ poly8x16x2_t test_vld2q_p8(poly8_t const * a) {

}

// CHECK-LABEL: @test_vld2q_p16(

-// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align 16

+// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align 8

// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*

// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*

// CHECK: [[VLD2Q_V:%.*]] = call { <8 x i16>, <8 x i16>

@@ -4839,24 +4840,24 @@ poly16x4x2_t test_vld2_dup_p16(poly16_t const * a) {

}

// CHECK-LABEL: @test_vld2q_lane_u16(

-// CHECK: [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16

-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16

-// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x2_t, align 16

+// CHECK: [[B:%.*]] = alloca %struct.uint16x8x2_t, align 8

+// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 8

+// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x2_t, align 8

// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[B]], i32 0, i32 0

// CHECK: [[TMP0:%.*]] = bitcast [2 x <8 x i16>]* [[COERCE_DIVE]] to [4 x i64]*

-// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16

+// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8

// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x2_t* [[__S1]] to i8*

// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x8x2_t* [[B]] to i8*

-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)

+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)

// CHECK: [[TMP3:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*

// CHECK: [[TMP4:%.*]] = bitcast i16* %a to i8*

// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0

// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0

-// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16

+// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 8

// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>

// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0

// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i32 0, i32 1

-// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16

+// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 8

// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>

// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>

// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>

@@ -4866,24 +4867,24 @@ uint16x8x2_t test_vld2q_lane_u16(uint16_t const * a, uint16x8x2_t b) {

}

// CHECK-LABEL: @test_vld2q_lane_u32(

-// CHECK: [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16

-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16

-// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align 16

+// CHECK: [[B:%.*]] = alloca %struct.uint32x4x2_t, align 8

+// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 8

+// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align 8

// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[B]], i32 0, i32 0

// CHECK: [[TMP0:%.*]] = bitcast [2 x <4 x i32>]* [[COERCE_DIVE]] to [4 x i64]*

-// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16

+// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8

// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8*

// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8*

-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)

+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)

// CHECK: [[TMP3:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*

// CHECK: [[TMP4:%.*]] = bitcast i32* %a to i8*

// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0

// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i32 0, i32 0

-// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16

+// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 8

// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>

// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0

// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i32 0, i32 1

-// CHECK: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16

+// CHECK: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 8

// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>

// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>

// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>

@@ -4893,24 +4894,24 @@ uint32x4x2_t test_vld2q_lane_u32(uint32_t const * a, uint32x4x2_t b) {

}

// CHECK-LABEL: @test_vld2q_lane_s16(

-// CHECK: [[B:%.*]] = alloca %struct.int16x8x2_t, align 16

-// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16

-// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x2_t, align 16

+// CHECK: [[B:%.*]] = alloca %struct.int16x8x2_t, align 8

+// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 8

+// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x2_t, align 8

// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[B]], i32 0, i32 0

// CHECK: [[TMP0:%.*]] = bitcast [2 x <8 x i16>]* [[COERCE_DIVE]] to [4 x i64]*

-// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16

+// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8

// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8*

// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8*

-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)

+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)

// CHECK: [[TMP3:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*

// CHECK: [[TMP4:%.*]] = bitcast i16* %a to i8*

// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0

// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0

-// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16

+// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 8

// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>

// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0

// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i32 0, i32 1

-// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16

+// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 8

// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>

// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>

// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>

@@ -4920,24 +4921,24 @@ int16x8x2_t test_vld2q_lane_s16(int16_t const * a, int16x8x2_t b) {

}

// CHECK-LABEL: @test_vld2q_lane_s32(

-// CHECK: [[B:%.*]] = alloca %struct.int32x4x2_t, align 16

-// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16

-// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x2_t, align 16

+// CHECK: [[B:%.*]] = alloca %struct.int32x4x2_t, align 8

+// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 8

+// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x2_t, align 8

// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[B]], i32 0, i32 0

// CHECK: [[TMP0:%.*]] = bitcast [2 x <4 x i32>]* [[COERCE_DIVE]] to [4 x i64]*

-// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16

+// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8

// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8*

// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8*

-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)

+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)

// CHECK: [[TMP3:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*

// CHECK: [[TMP4:%.*]] = bitcast i32* %a to i8*

// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0

// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i32 0, i32 0

-// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16

+// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 8

// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>

// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0

// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i32 0, i32 1

-// CHECK: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16

+// CHECK: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 8

// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>

// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>

// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>

@@ -4947,24 +4948,24 @@ int32x4x2_t test_vld2q_lane_s32(int32_t const * a, int32x4x2_t b) {

}

// CHECK-LABEL: @test_vld2q_lane_f16(

-// CHECK: [[B:%.*]] = alloca %struct.float16x8x2_t, align 16

-// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16

-// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x2_t, align 16

+// CHECK: [[B:%.*]] = alloca %struct.float16x8x2_t, align 8

+// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 8

+// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x2_t, align 8

// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[B]], i32 0, i32 0

// CHECK: [[TMP0:%.*]] = bitcast [2 x <8 x half>]* [[COERCE_DIVE]] to [4 x i64]*

-// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16

+// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8

// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8*

// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8*

-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)

+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)

// CHECK: [[TMP3:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*

// CHECK: [[TMP4:%.*]] = bitcast half* %a to i8*

// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0

// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]], i32 0, i32 0

-// CHECK: [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16

+// CHECK: [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 8

// CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>

// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0

// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL1]], i32 0, i32 1

-// CHECK: [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16

+// CHECK: [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 8

// CHECK: [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>

// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>

// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>

@@ -4974,24 +4975,24 @@ float16x8x2_t test_vld2q_lane_f16(float16_t const * a, float16x8x2_t b) {

}

// CHECK-LABEL: @test_vld2q_lane_f32(

-// CHECK: [[B:%.*]] = alloca %struct.float32x4x2_t, align 16

-// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16

-// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x2_t, align 16

+// CHECK: [[B:%.*]] = alloca %struct.float32x4x2_t, align 8

+// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 8

+// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x2_t, align 8

// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[B]], i32 0, i32 0

// CHECK: [[TMP0:%.*]] = bitcast [2 x <4 x float>]* [[COERCE_DIVE]] to [4 x i64]*

-// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16

+// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8

// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8*

// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8*

-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)

+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)

// CHECK: [[TMP3:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*

// CHECK: [[TMP4:%.*]] = bitcast float* %a to i8*

// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0

// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]], i32 0, i32 0

-// CHECK: [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16

+// CHECK: [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 8

// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>

// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0

// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL1]], i32 0, i32 1

-// CHECK: [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16

+// CHECK: [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 8

// CHECK: [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>

// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>

// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>

@@ -5001,24 +5002,24 @@ float32x4x2_t test_vld2q_lane_f32(float32_t const * a, float32x4x2_t b) {

}

// CHECK-LABEL: @test_vld2q_lane_p16(

-// CHECK: [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16

-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16

-// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align 16

+// CHECK: [[B:%.*]] = alloca %struct.poly16x8x2_t, align 8

+// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 8

+// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align 8

// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[B]], i32 0, i32 0

// CHECK: [[TMP0:%.*]] = bitcast [2 x <8 x i16>]* [[COERCE_DIVE]] to [4 x i64]*

-// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16

+// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8

// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8*

// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x8x2_t* [[B]] to i8*

-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)

+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)

// CHECK: [[TMP3:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*

// CHECK: [[TMP4:%.*]] = bitcast i16* %a to i8*

// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0

// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0

-// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16

+// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 8

// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>

// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0

// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i32 0, i32 1

-// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16

+// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 8

// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>

// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>

// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>

@@ -5283,7 +5284,7 @@ poly16x4x2_t test_vld2_lane_p16(poly16_t const * a, poly16x4x2_t b) {

}

// CHECK-LABEL: @test_vld3q_u8(

-// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x3_t, align 16

+// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x3_t, align 8

// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*

// CHECK: [[VLD3Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>

uint8x16x3_t test_vld3q_u8(uint8_t const * a) {

@@ -5291,7 +5292,7 @@ uint8x16x3_t test_vld3q_u8(uint8_t const * a) {

}

// CHECK-LABEL: @test_vld3q_u16(

-// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x3_t, align 16

+// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x3_t, align 8

// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*

// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*

// CHECK: [[VLD3Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>

@@ -5300,7 +5301,7 @@ uint16x8x3_t test_vld3q_u16(uint16_t const * a) {

}

// CHECK-LABEL: @test_vld3q_u32(

-// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align 16

+// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align 8

// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*

// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*

// CHECK: [[VLD3Q_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>

@@ -5309,7 +5310,7 @@ uint32x4x3_t test_vld3q_u32(uint32_t const * a) {

}

// CHECK-LABEL: @test_vld3q_s8(

-// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x3_t, align 16

+// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x3_t, align 8

// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8*

// CHECK: [[VLD3Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>

int8x16x3_t test_vld3q_s8(int8_t const * a) {

@@ -5317,7 +5318,7 @@ int8x16x3_t test_vld3q_s8(int8_t const * a) {

}

// CHECK-LABEL: @test_vld3q_s16(

-// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x3_t, align 16

+// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x3_t, align 8

// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*

// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*

// CHECK: [[VLD3Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>

@@ -5326,7 +5327,7 @@ int16x8x3_t test_vld3q_s16(int16_t const * a) {

}

// CHECK-LABEL: @test_vld3q_s32(

-// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x3_t, align 16

+// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x3_t, align 8

// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*

// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*

// CHECK: [[VLD3Q_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>

@@ -5335,7 +5336,7 @@ int32x4x3_t test_vld3q_s32(int32_t const * a) {

}

// CHECK-LABEL: @test_vld3q_f16(

-// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x3_t, align 16

+// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x3_t, align 8

// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*

// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8*

// CHECK: [[VLD3Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>

@@ -5344,7 +5345,7 @@ float16x8x3_t test_vld3q_f16(float16_t const * a) {

}

// CHECK-LABEL: @test_vld3q_f32(

-// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x3_t, align 16

+// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x3_t, align 8

// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*

// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8*

// CHECK: [[VLD3Q_V:%.*]] = call { <4 x float>, <4 x float>, <4 x float>

@@ -5353,7 +5354,7 @@ float32x4x3_t test_vld3q_f32(float32_t const * a) {

}

// CHECK-LABEL: @test_vld3q_p8(

-// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x3_t, align 16

+// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x3_t, align 8

// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8*

// CHECK: [[VLD3Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>

poly8x16x3_t test_vld3q_p8(poly8_t const * a) {

@@ -5361,7 +5362,7 @@ poly8x16x3_t test_vld3q_p8(poly8_t const * a) {

}

// CHECK-LABEL: @test_vld3q_p16(

-// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x3_t, align 16

+// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x3_t, align 8

// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*

// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*

// CHECK: [[VLD3Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>

@@ -5580,28 +5581,28 @@ poly16x4x3_t test_vld3_dup_p16(poly16_t const * a) {

}

// CHECK-LABEL: @test_vld3q_lane_u16(

-// CHECK: [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16

-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16

-// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x3_t, align 16

+// CHECK: [[B:%.*]] = alloca %struct.uint16x8x3_t, align 8

+// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 8

+// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x3_t, align 8

// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[B]], i32 0, i32 0

// CHECK: [[TMP0:%.*]] = bitcast [3 x <8 x i16>]* [[COERCE_DIVE]] to [6 x i64]*

-// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16

+// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 8

// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8*

// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8*

-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)

+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 8, i1 false)

// CHECK: [[TMP3:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*

// CHECK: [[TMP4:%.*]] = bitcast i16* %a to i8*

// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0

// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0

-// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16

+// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 8

// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>

// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0

// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i32 0, i32 1

-// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16

+// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 8

// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>

// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0

// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i32 0, i32 2

-// CHECK: [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16

+// CHECK: [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 8

// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>

// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>

// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>

@@ -5612,28 +5613,28 @@ uint16x8x3_t test_vld3q_lane_u16(uint16_t const * a, uint16x8x3_t b) {

}

// CHECK-LABEL: @test_vld3q_lane_u32(

-// CHECK: [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16

-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16

-// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align 16

+// CHECK: [[B:%.*]] = alloca %struct.uint32x4x3_t, align 8

+// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 8

+// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align 8

// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[B]], i32 0, i32 0

// CHECK: [[TMP0:%.*]] = bitcast [3 x <4 x i32>]* [[COERCE_DIVE]] to [6 x i64]*

-// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16

+// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 8

// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[__S1]] to i8*

// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8*

-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)

+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 8, i1 false)

// CHECK: [[TMP3:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*

// CHECK: [[TMP4:%.*]] = bitcast i32* %a to i8*

// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0

// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i32 0, i32 0

-// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16

+// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 8

// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>

// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0

// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i32 0, i32 1

-// CHECK: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16

+// CHECK: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 8

// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>

// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0

// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i32 0, i32 2

-// CHECK: [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16

+// CHECK: [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 8

// CHECK: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>

// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>

// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>

@@ -5644,28 +5645,28 @@ uint32x4x3_t test_vld3q_lane_u32(uint32_t const * a, uint32x4x3_t b) {

}

// CHECK-LABEL: @test_vld3q_lane_s16(

-// CHECK: [[B:%.*]] = alloca %struct.int16x8x3_t, align 16

-// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16

-// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x3_t, align 16

+// CHECK: [[B:%.*]] = alloca %struct.int16x8x3_t, align 8

+// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 8

+// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x3_t, align 8

// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[B]], i32 0, i32 0

// CHECK: [[TMP0:%.*]] = bitcast [3 x <8 x i16>]* [[COERCE_DIVE]] to [6 x i64]*

-// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16

+// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 8

// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8*

// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8*

-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)

+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 8, i1 false)

// CHECK: [[TMP3:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*

// CHECK: [[TMP4:%.*]] = bitcast i16* %a to i8*

// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0

// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0

-// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16

+// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 8

// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>

// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0

// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i32 0, i32 1

-// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16

+// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 8

// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>

// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0

// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i32 0, i32 2

-// CHECK: [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16

+// CHECK: [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 8

// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>

// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>

// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>

@@ -5676,28 +5677,28 @@ int16x8x3_t test_vld3q_lane_s16(int16_t const * a, int16x8x3_t b) {

}

// CHECK-LABEL: @test_vld3q_lane_s32(

-// CHECK: [[B:%.*]] = alloca %struct.int32x4x3_t, align 16

-// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16

-// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x3_t, align 16

+// CHECK: [[B:%.*]] = alloca %struct.int32x4x3_t, align 8

+// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 8

+// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x3_t, align 8

// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[B]], i32 0, i32 0

// CHECK: [[TMP0:%.*]] = bitcast [3 x <4 x i32>]* [[COERCE_DIVE]] to [6 x i64]*

-// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16

+// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 8

// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8*

// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8*

-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)

+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 8, i1 false)

// CHECK: [[TMP3:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*

// CHECK: [[TMP4:%.*]] = bitcast i32* %a to i8*

// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0

// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i32 0, i32 0

-// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16

+// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 8

// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>

// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0

// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i32 0, i32 1

-// CHECK: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16

+// CHECK: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 8

// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>

// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0

// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i32 0, i32 2

-// CHECK: [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16

+// CHECK: [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 8

// CHECK: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>

// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>

// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>

@@ -5708,28 +5709,28 @@ int32x4x3_t test_vld3q_lane_s32(int32_t const * a, int32x4x3_t b) {

}

// CHECK-LABEL: @test_vld3q_lane_f16(

-// CHECK: [[B:%.*]] = alloca %struct.float16x8x3_t, align 16

-// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16

-// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x3_t, align 16

+// CHECK: [[B:%.*]] = alloca %struct.float16x8x3_t, align 8

+// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 8

+// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x3_t, align 8

// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[B]], i32 0, i32 0

// CHECK: [[TMP0:%.*]] = bitcast [3 x <8 x half>]* [[COERCE_DIVE]] to [6 x i64]*

-// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16

+// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 8

// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8*

// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8*

-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)

+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 8, i1 false)

// CHECK: [[TMP3:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*

// CHECK: [[TMP4:%.*]] = bitcast half* %a to i8*

// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0

// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]], i32 0, i32 0

-// CHECK: [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16

+// CHECK: [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 8

// CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>

// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0

// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL1]], i32 0, i32 1

-// CHECK: [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16

+// CHECK: [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 8

// CHECK: [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>

// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0

// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL3]], i32 0, i32 2

-// CHECK: [[TMP9:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16

+// CHECK: [[TMP9:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 8

// CHECK: [[TMP10:%.*]] = bitcast <8 x half> [[TMP9]] to <16 x i8>

// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>

// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>

@@ -5740,28 +5741,28 @@ float16x8x3_t test_vld3q_lane_f16(float16_t const * a, float16x8x3_t b) {

}

// CHECK-LABEL: @test_vld3q_lane_f32(

-// CHECK: [[B:%.*]] = alloca %struct.float32x4x3_t, align 16

-// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16

-// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x3_t, align 16

+// CHECK: [[B:%.*]] = alloca %struct.float32x4x3_t, align 8

+// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 8

+// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x3_t, align 8

// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[B]], i32 0, i32 0

// CHECK: [[TMP0:%.*]] = bitcast [3 x <4 x float>]* [[COERCE_DIVE]] to [6 x i64]*

-// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16

+// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 8

// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8*

// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8*

-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)

+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 8, i1 false)

// CHECK: [[TMP3:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*

// CHECK: [[TMP4:%.*]] = bitcast float* %a to i8*

// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0

// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]], i32 0, i32 0

-// CHECK: [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16

+// CHECK: [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 8

// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>

// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0

// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL1]], i32 0, i32 1

-// CHECK: [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16

+// CHECK: [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 8

// CHECK: [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>

// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0

// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL3]], i32 0, i32 2

-// CHECK: [[TMP9:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16

+// CHECK: [[TMP9:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 8

// CHECK: [[TMP10:%.*]] = bitcast <4 x float> [[TMP9]] to <16 x i8>

// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>

// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>

@@ -5772,28 +5773,28 @@ float32x4x3_t test_vld3q_lane_f32(float32_t const * a, float32x4x3_t b) {

}

// CHECK-LABEL: @test_vld3q_lane_p16(

-// CHECK: [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16

-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16

-// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x3_t, align 16

+// CHECK: [[B:%.*]] = alloca %struct.poly16x8x3_t, align 8

+// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 8

+// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x3_t, align 8

// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[B]], i32 0, i32 0

// CHECK: [[TMP0:%.*]] = bitcast [3 x <8 x i16>]* [[COERCE_DIVE]] to [6 x i64]*

-// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16

+// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 8

// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[__S1]] to i8*

// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8*

-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)

+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 8, i1 false)

// CHECK: [[TMP3:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*

// CHECK: [[TMP4:%.*]] = bitcast i16* %a to i8*

// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0

// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0

-// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16

+// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 8

// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>

// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0

// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i32 0, i32 1

-// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16

+// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 8

// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>

// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0

// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i32 0, i32 2

-// CHECK: [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16

+// CHECK: [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 8

// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>

// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>

// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>

@@ -6103,7 +6104,7 @@ poly16x4x3_t test_vld3_lane_p16(poly16_t const * a, poly16x4x3_t b) {

}

// CHECK-LABEL: @test_vld4q_u8(

-// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x4_t, align 16

+// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x4_t, align 8

// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8*

// CHECK: [[VLD4Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>

uint8x16x4_t test_vld4q_u8(uint8_t const * a) {

@@ -6111,7 +6112,7 @@ uint8x16x4_t test_vld4q_u8(uint8_t const * a) {

}

// CHECK-LABEL: @test_vld4q_u16(

-// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align 16

+// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align 8

// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*

// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*

// CHECK: [[VLD4Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>

@@ -6120,7 +6121,7 @@ uint16x8x4_t test_vld4q_u16(uint16_t const * a) {

}

// CHECK-LABEL: @test_vld4q_u32(

-// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align 16

+// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align 8

// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*

// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*

// CHECK: [[VLD4Q_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>

@@ -6129,7 +6130,7 @@ uint32x4x4_t test_vld4q_u32(uint32_t const * a) {

}

// CHECK-LABEL: @test_vld4q_s8(

-// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x4_t, align 16

+// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x4_t, align 8

// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8*

// CHECK: [[VLD4Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>

int8x16x4_t test_vld4q_s8(int8_t const * a) {

@@ -6137,7 +6138,7 @@ int8x16x4_t test_vld4q_s8(int8_t const * a) {

}

// CHECK-LABEL: @test_vld4q_s16(

-// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x4_t, align 16

+// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x4_t, align 8

// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*

// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*

// CHECK: [[VLD4Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>

@@ -6146,7 +6147,7 @@ int16x8x4_t test_vld4q_s16(int16_t const * a) {

}

// CHECK-LABEL: @test_vld4q_s32(

-// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x4_t, align 16

+// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x4_t, align 8

// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*

// CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8*

// CHECK: [[VLD4Q_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>

@@ -6155,7 +6156,7 @@ int32x4x4_t test_vld4q_s32(int32_t const * a) {

}

// CHECK-LABEL: @test_vld4q_f16(

-// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x4_t, align 16

+// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x4_t, align 8

// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*

// CHECK: [[TMP1:%.*]] = bitcast half* %a to i8*

// CHECK: [[VLD4Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>

@@ -6164,7 +6165,7 @@ float16x8x4_t test_vld4q_f16(float16_t const * a) {

}

// CHECK-LABEL: @test_vld4q_f32(

-// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x4_t, align 16

+// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x4_t, align 8

// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*

// CHECK: [[TMP1:%.*]] = bitcast float* %a to i8*

// CHECK: [[VLD4Q_V:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float>

@@ -6173,7 +6174,7 @@ float32x4x4_t test_vld4q_f32(float32_t const * a) {

}

// CHECK-LABEL: @test_vld4q_p8(

-// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x4_t, align 16

+// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x4_t, align 8

// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8*

// CHECK: [[VLD4Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>

poly8x16x4_t test_vld4q_p8(poly8_t const * a) {

@@ -6181,7 +6182,7 @@ poly8x16x4_t test_vld4q_p8(poly8_t const * a) {

}

// CHECK-LABEL: @test_vld4q_p16(

-// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align 16

+// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align 8

// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*

// CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8*

// CHECK: [[VLD4Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>

@@ -6400,32 +6401,32 @@ poly16x4x4_t test_vld4_dup_p16(poly16_t const * a) {

}

// CHECK-LABEL: @test_vld4q_lane_u16(

-// CHECK: [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16

-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16

-// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align 16

+// CHECK: [[B:%.*]] = alloca %struct.uint16x8x4_t, align 8

+// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 8

+// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align 8

// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[B]], i32 0, i32 0

// CHECK: [[TMP0:%.*]] = bitcast [4 x <8 x i16>]* [[COERCE_DIVE]] to [8 x i64]*

-// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16

+// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 8

// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8*

// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8*

-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)

+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 8, i1 false)

// CHECK: [[TMP3:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*

// CHECK: [[TMP4:%.*]] = bitcast i16* %a to i8*

// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0

// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0

-// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16

+// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 8

// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>

// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0

// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i32 0, i32 1

-// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16

+// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 8

// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>

// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0

// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i32 0, i32 2

-// CHECK: [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16

+// CHECK: [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 8

// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>

// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0

// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i32 0, i32 3

-// CHECK: [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16

+// CHECK: [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 8

// CHECK: [[TMP12:%.*]] = bitcast <8 x i16> [[TMP11]] to <16 x i8>

// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>

// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>

@@ -6437,32 +6438,32 @@ uint16x8x4_t test_vld4q_lane_u16(uint16_t const * a, uint16x8x4_t b) {

}

// CHECK-LABEL: @test_vld4q_lane_u32(

-// CHECK: [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16

-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16

-// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align 16

+// CHECK: [[B:%.*]] = alloca %struct.uint32x4x4_t, align 8

+// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 8

+// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align 8

// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[B]], i32 0, i32 0

// CHECK: [[TMP0:%.*]] = bitcast [4 x <4 x i32>]* [[COERCE_DIVE]] to [8 x i64]*

-// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16

+// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 8

// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8*

// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8*

-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)

+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 8, i1 false)

// CHECK: [[TMP3:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*

// CHECK: [[TMP4:%.*]] = bitcast i32* %a to i8*

// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0

// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i32 0, i32 0

-// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16

+// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 8

// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>

// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0

// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i32 0, i32 1

-// CHECK: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16

+// CHECK: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 8

// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>

// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0

// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i32 0, i32 2

-// CHECK: [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16

+// CHECK: [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 8

// CHECK: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>

// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0

// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i32 0, i32 3

-// CHECK: [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16

+// CHECK: [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 8

// CHECK: [[TMP12:%.*]] = bitcast <4 x i32> [[TMP11]] to <16 x i8>

// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>

// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>

@@ -6474,32 +6475,32 @@ uint32x4x4_t test_vld4q_lane_u32(uint32_t const * a, uint32x4x4_t b) {

}

// CHECK-LABEL: @test_vld4q_lane_s16(

-// CHECK: [[B:%.*]] = alloca %struct.int16x8x4_t, align 16

-// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16

-// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x4_t, align 16

+// CHECK: [[B:%.*]] = alloca %struct.int16x8x4_t, align 8

+// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 8

+// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x4_t, align 8

// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[B]], i32 0, i32 0

// CHECK: [[TMP0:%.*]] = bitcast [4 x <8 x i16>]* [[COERCE_DIVE]] to [8 x i64]*

-// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16

+// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 8

// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8*

// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8*

-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)

+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 8, i1 false)

// CHECK: [[TMP3:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*

// CHECK: [[TMP4:%.*]] = bitcast i16* %a to i8*

// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0

// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0

-// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16

+// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 8

// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>

// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0

// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i32 0, i32 1

-// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16

+// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 8

// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>

// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0

// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i32 0, i32 2

-// CHECK: [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16

+// CHECK: [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 8

// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>

// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0

// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i32 0, i32 3

-// CHECK: [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16

+// CHECK: [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 8

// CHECK: [[TMP12:%.*]] = bitcast <8 x i16> [[TMP11]] to <16 x i8>

// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>

// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>

@@ -6511,32 +6512,32 @@ int16x8x4_t test_vld4q_lane_s16(int16_t const * a, int16x8x4_t b) {

}

// CHECK-LABEL: @test_vld4q_lane_s32(

-// CHECK: [[B:%.*]] = alloca %struct.int32x4x4_t, align 16

-// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16

-// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x4_t, align 16

+// CHECK: [[B:%.*]] = alloca %struct.int32x4x4_t, align 8

+// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 8

+// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x4_t, align 8

// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[B]], i32 0, i32 0

// CHECK: [[TMP0:%.*]] = bitcast [4 x <4 x i32>]* [[COERCE_DIVE]] to [8 x i64]*

-// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16

+// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 8

// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8*

// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8*

-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)

+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 8, i1 false)

// CHECK: [[TMP3:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*

// CHECK: [[TMP4:%.*]] = bitcast i32* %a to i8*

// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0

// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i32 0, i32 0

-// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16

+// CHECK: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 8

// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>

// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0

// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i32 0, i32 1

-// CHECK: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16

+// CHECK: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 8

// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>

// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0

// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i32 0, i32 2

-// CHECK: [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16

+// CHECK: [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 8

// CHECK: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>

// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0

// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i32 0, i32 3

-// CHECK: [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16

+// CHECK: [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 8

// CHECK: [[TMP12:%.*]] = bitcast <4 x i32> [[TMP11]] to <16 x i8>

// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>

// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>

@@ -6548,32 +6549,32 @@ int32x4x4_t test_vld4q_lane_s32(int32_t const * a, int32x4x4_t b) {

}

// CHECK-LABEL: @test_vld4q_lane_f16(

-// CHECK: [[B:%.*]] = alloca %struct.float16x8x4_t, align 16

-// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16

-// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x4_t, align 16

+// CHECK: [[B:%.*]] = alloca %struct.float16x8x4_t, align 8

+// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 8

+// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x4_t, align 8

// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[B]], i32 0, i32 0

// CHECK: [[TMP0:%.*]] = bitcast [4 x <8 x half>]* [[COERCE_DIVE]] to [8 x i64]*

-// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16

+// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 8

// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8*

// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8*

-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)

+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 8, i1 false)

// CHECK: [[TMP3:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*

// CHECK: [[TMP4:%.*]] = bitcast half* %a to i8*

// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0

// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]], i32 0, i32 0

-// CHECK: [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16

+// CHECK: [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 8

// CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>

// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0

// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL1]], i32 0, i32 1

-// CHECK: [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16

+// CHECK: [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 8

// CHECK: [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>

// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0

// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL3]], i32 0, i32 2

-// CHECK: [[TMP9:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16

+// CHECK: [[TMP9:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 8

// CHECK: [[TMP10:%.*]] = bitcast <8 x half> [[TMP9]] to <16 x i8>

// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0

// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL5]], i32 0, i32 3

-// CHECK: [[TMP11:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX6]], align 16

+// CHECK: [[TMP11:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX6]], align 8

// CHECK: [[TMP12:%.*]] = bitcast <8 x half> [[TMP11]] to <16 x i8>

// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>

// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>

@@ -6585,32 +6586,32 @@ float16x8x4_t test_vld4q_lane_f16(float16_t const * a, float16x8x4_t b) {

}

// CHECK-LABEL: @test_vld4q_lane_f32(

-// CHECK: [[B:%.*]] = alloca %struct.float32x4x4_t, align 16

-// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16

-// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x4_t, align 16

+// CHECK: [[B:%.*]] = alloca %struct.float32x4x4_t, align 8

+// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 8

+// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x4_t, align 8

// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[B]], i32 0, i32 0

// CHECK: [[TMP0:%.*]] = bitcast [4 x <4 x float>]* [[COERCE_DIVE]] to [8 x i64]*

-// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16

+// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 8

// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8*

// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8*

-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)

+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 8, i1 false)

// CHECK: [[TMP3:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*

// CHECK: [[TMP4:%.*]] = bitcast float* %a to i8*

// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0

// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]], i32 0, i32 0

-// CHECK: [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16

+// CHECK: [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 8

// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>

// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0

// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL1]], i32 0, i32 1

-// CHECK: [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16

+// CHECK: [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 8

// CHECK: [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>

// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0

// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL3]], i32 0, i32 2

-// CHECK: [[TMP9:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16

+// CHECK: [[TMP9:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 8

// CHECK: [[TMP10:%.*]] = bitcast <4 x float> [[TMP9]] to <16 x i8>

// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0

// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL5]], i32 0, i32 3

-// CHECK: [[TMP11:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX6]], align 16

+// CHECK: [[TMP11:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX6]], align 8

// CHECK: [[TMP12:%.*]] = bitcast <4 x float> [[TMP11]] to <16 x i8>

// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>

// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>

@@ -6622,32 +6623,32 @@ float32x4x4_t test_vld4q_lane_f32(float32_t const * a, float32x4x4_t b) {

}

// CHECK-LABEL: @test_vld4q_lane_p16(

-// CHECK: [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16

-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16

-// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align 16

+// CHECK: [[B:%.*]] = alloca %struct.poly16x8x4_t, align 8

+// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 8

+// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align 8

// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[B]], i32 0, i32 0

// CHECK: [[TMP0:%.*]] = bitcast [4 x <8 x i16>]* [[COERCE_DIVE]] to [8 x i64]*

-// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16

+// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 8

// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8*

// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8*

-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)

+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 8, i1 false)

// CHECK: [[TMP3:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*

// CHECK: [[TMP4:%.*]] = bitcast i16* %a to i8*

// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0

// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0

-// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16

+// CHECK: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 8

// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>

// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0

// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i32 0, i32 1

-// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16

+// CHECK: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 8

// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>

// CHECK: [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0

// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i32 0, i32 2

-// CHECK: [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16

+// CHECK: [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 8

// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>

// CHECK: [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0

// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i32 0, i32 3

-// CHECK: [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16

+// CHECK: [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 8

// CHECK: [[TMP12:%.*]] = bitcast <8 x i16> [[TMP11]] to <16 x i8>

// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>

// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>

@@ -14548,21 +14549,21 @@ float32x4_t test_vsetq_lane_f32(float32_t a, float32x4_t b) {

// CHECK-LABEL: @test_vsetq_lane_f16(

// CHECK: [[__REINT_248:%.*]] = alloca half, align 2

-// CHECK: [[__REINT1_248:%.*]] = alloca <8 x half>, align 16

-// CHECK: [[__REINT2_248:%.*]] = alloca <8 x i16>, align 16

+// CHECK: [[__REINT1_248:%.*]] = alloca <8 x half>, align 8

+// CHECK: [[__REINT2_248:%.*]] = alloca <8 x i16>, align 8

// CHECK: [[TMP0:%.*]] = load half, half* %a, align 2

// CHECK: store half [[TMP0]], half* [[__REINT_248]], align 2

-// CHECK: store <8 x half> %b, <8 x half>* [[__REINT1_248]], align 16

+// CHECK: store <8 x half> %b, <8 x half>* [[__REINT1_248]], align 8

// CHECK: [[TMP1:%.*]] = bitcast half* [[__REINT_248]] to i16*

// CHECK: [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2

// CHECK: [[TMP3:%.*]] = bitcast <8 x half>* [[__REINT1_248]] to <8 x i16>*

-// CHECK: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[TMP3]], align 16

+// CHECK: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[TMP3]], align 8

// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>

// CHECK: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>

// CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i16> [[TMP6]], i16 [[TMP2]], i32 3

-// CHECK: store <8 x i16> [[VSET_LANE]], <8 x i16>* [[__REINT2_248]], align 16

+// CHECK: store <8 x i16> [[VSET_LANE]], <8 x i16>* [[__REINT2_248]], align 8

// CHECK: [[TMP7:%.*]] = bitcast <8 x i16>* [[__REINT2_248]] to <8 x half>*

-// CHECK: [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[TMP7]], align 16

+// CHECK: [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[TMP7]], align 8

// CHECK: ret <8 x half> [[TMP8]]

float16x8_t test_vsetq_lane_f16(float16_t *a, float16x8_t b) {

return vsetq_lane_f16(*a, b, 3);

@@ -16193,20 +16194,20 @@ void test_vst1_lane_p16(poly16_t * a, poly16x4_t b) {

}

// CHECK-LABEL: @test_vst2q_u8(

-// CHECK: [[B:%.*]] = alloca %struct.uint8x16x2_t, align 16

-// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x2_t, align 16

+// CHECK: [[B:%.*]] = alloca %struct.uint8x16x2_t, align 8

+// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x2_t, align 8

// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[B]], i32 0, i32 0

// CHECK: [[TMP0:%.*]] = bitcast [2 x <16 x i8>]* [[COERCE_DIVE]] to [4 x i64]*

-// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16

+// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8

// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x16x2_t* [[__S1]] to i8*

// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x16x2_t* [[B]] to i8*

-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)

+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)

// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0

// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i32 0, i32 0

-// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16

+// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 8

// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0

// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i32 0, i32 1

-// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16

+// CHECK: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 8

// CHECK: call void @llvm.arm.neon.vst2.p0i8.v16i8(i8* %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i32 1)

// CHECK: ret void

void test_vst2q_u8(uint8_t * a, uint8x16x2_t b) {

@@ -16214,22 +16215,22 @@ void test_vst2q_u8(uint8_t * a, uint8x16x2_t b) {

}

// CHECK-LABEL: @test_vst2q_u16(