Diffstat (limited to 'test/Transforms')
105 files changed, 7724 insertions, 820 deletions
diff --git a/test/Transforms/CodeExtractor/ExtractedFnEntryCount.ll b/test/Transforms/CodeExtractor/ExtractedFnEntryCount.ll index 509a4d7bfa18..8313cfac04ee 100644 --- a/test/Transforms/CodeExtractor/ExtractedFnEntryCount.ll +++ b/test/Transforms/CodeExtractor/ExtractedFnEntryCount.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -partial-inliner -S | FileCheck %s +; RUN: opt < %s -partial-inliner -skip-partial-inlining-cost-analysis -S | FileCheck %s ; This test checks to make sure that the CodeExtractor ; properly sets the entry count for the function that is diff --git a/test/Transforms/CodeExtractor/MultipleExitBranchProb.ll b/test/Transforms/CodeExtractor/MultipleExitBranchProb.ll index 425e96973596..8e362080dc48 100644 --- a/test/Transforms/CodeExtractor/MultipleExitBranchProb.ll +++ b/test/Transforms/CodeExtractor/MultipleExitBranchProb.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -partial-inliner -max-num-inline-blocks=2 -S | FileCheck %s +; RUN: opt < %s -partial-inliner -max-num-inline-blocks=2 -skip-partial-inlining-cost-analysis -S | FileCheck %s ; This test checks to make sure that CodeExtractor updates ; the exit branch probabilities for multiple exit blocks. diff --git a/test/Transforms/CodeExtractor/PartialInlineAnd.ll b/test/Transforms/CodeExtractor/PartialInlineAnd.ll index e981a5ba5816..d32d834d2df3 100644 --- a/test/Transforms/CodeExtractor/PartialInlineAnd.ll +++ b/test/Transforms/CodeExtractor/PartialInlineAnd.ll @@ -1,7 +1,7 @@ ; RUN: opt < %s -partial-inliner -S | FileCheck %s ; RUN: opt < %s -passes=partial-inliner -S | FileCheck %s -; RUN: opt < %s -partial-inliner -max-num-inline-blocks=2 -S | FileCheck --check-prefix=LIMIT %s -; RUN: opt < %s -passes=partial-inliner -max-num-inline-blocks=2 -S | FileCheck --check-prefix=LIMIT %s +; RUN: opt < %s -partial-inliner -skip-partial-inlining-cost-analysis -max-num-inline-blocks=2 -S | FileCheck --check-prefix=LIMIT %s +; RUN: opt < %s -passes=partial-inliner -skip-partial-inlining-cost-analysis -max-num-inline-blocks=2 -S | FileCheck --check-prefix=LIMIT %s ; Function Attrs: nounwind uwtable define i32 @bar(i32 %arg) local_unnamed_addr #0 { diff --git a/test/Transforms/CodeExtractor/PartialInlineEntryUpdate.ll b/test/Transforms/CodeExtractor/PartialInlineEntryUpdate.ll new file mode 100644 index 000000000000..3a7a9752e507 --- /dev/null +++ b/test/Transforms/CodeExtractor/PartialInlineEntryUpdate.ll @@ -0,0 +1,41 @@ +; RUN: opt < %s -skip-partial-inlining-cost-analysis -partial-inliner -S | FileCheck %s +; RUN: opt < %s -skip-partial-inlining-cost-analysis -passes=partial-inliner -S | FileCheck %s + +define i32 @Func(i1 %cond, i32* align 4 %align.val) !prof !1 { +; CHECK: @Func({{.*}}) !prof [[REMAINCOUNT:![0-9]+]] +entry: + br i1 %cond, label %if.then, label %return +if.then: + ; Dummy store to have more than 0 uses + store i32 10, i32* %align.val, align 4 + br label %return +return: ; preds = %entry + ret i32 0 +} + +define internal i32 @Caller1(i1 %cond, i32* align 2 %align.val) !prof !3{ +entry: +; CHECK-LABEL: @Caller1 +; CHECK: br +; CHECK: call void @Func.1_ +; CHECK: br +; CHECK: call void @Func.1_ + %val = call i32 @Func(i1 %cond, i32* %align.val) + %val2 = call i32 @Func(i1 %cond, i32* %align.val) + ret i32 %val +} + +define internal i32 @Caller2(i1 %cond, i32* align 2 %align.val) !prof !2{ +entry: +; CHECK-LABEL: @Caller2 +; CHECK: br +; CHECK: call void @Func.1_ + %val = call i32 @Func(i1 %cond, i32* %align.val) + ret i32 %val +} + +; CHECK: [[REMAINCOUNT]] = !{!"function_entry_count", i64 150} +!1 = !{!"function_entry_count", 
i64 200} +!2 = !{!"function_entry_count", i64 10} +!3 = !{!"function_entry_count", i64 20} + diff --git a/test/Transforms/CodeExtractor/PartialInlineHighCost.ll b/test/Transforms/CodeExtractor/PartialInlineHighCost.ll new file mode 100644 index 000000000000..e43a94dc6c37 --- /dev/null +++ b/test/Transforms/CodeExtractor/PartialInlineHighCost.ll @@ -0,0 +1,107 @@ +; The outlined region has high frequency and the outlining +; call sequence is expensive (input, output, multiple exit etc) +; RUN: opt < %s -partial-inliner -max-num-inline-blocks=2 -S | FileCheck %s +; RUN: opt < %s -passes=partial-inliner -max-num-inline-blocks=2 -S | FileCheck %s +; RUN: opt < %s -partial-inliner -skip-partial-inlining-cost-analysis -max-num-inline-blocks=2 -S | FileCheck --check-prefix=NOCOST %s +; RUN: opt < %s -passes=partial-inliner -skip-partial-inlining-cost-analysis -max-num-inline-blocks=2 -S | FileCheck --check-prefix=NOCOST %s + + +; Function Attrs: nounwind +define i32 @bar_hot_outline_region(i32 %arg) local_unnamed_addr #0 { +bb: + %tmp = icmp slt i32 %arg, 0 + br i1 %tmp, label %bb1, label %bb16, !prof !1 + +bb1: ; preds = %bb + %tmp2 = tail call i32 (...) @foo() #0 + %tmp3 = tail call i32 (...) @foo() #0 + %tmp4 = tail call i32 (...) @foo() #0 + %tmp5 = tail call i32 (...) @foo() #0 + %tmp6 = tail call i32 (...) @foo() #0 + %tmp7 = tail call i32 (...) @foo() #0 + %tmp8 = add nsw i32 %arg, 1 + %tmp9 = tail call i32 @goo(i32 %tmp8) #0 + %tmp10 = tail call i32 (...) @foo() #0 + %tmp11 = icmp eq i32 %tmp10, 0 + br i1 %tmp11, label %bb12, label %bb16 + +bb12: ; preds = %bb1 + %tmp13 = tail call i32 (...) @foo() #0 + %tmp14 = icmp eq i32 %tmp13, 0 + %tmp15 = select i1 %tmp14, i32 0, i32 3 + br label %bb16 + +bb16: ; preds = %bb12, %bb1, %bb + %tmp17 = phi i32 [ 2, %bb1 ], [ %tmp15, %bb12 ], [ 0, %bb ] + ret i32 %tmp17 +} + +define i32 @bar_cold_outline_region(i32 %arg) local_unnamed_addr #0 { +bb: + %tmp = icmp slt i32 %arg, 0 + br i1 %tmp, label %bb1, label %bb16, !prof !2 + +bb1: ; preds = %bb + %tmp2 = tail call i32 (...) @foo() #0 + %tmp3 = tail call i32 (...) @foo() #0 + %tmp4 = tail call i32 (...) @foo() #0 + %tmp5 = tail call i32 (...) @foo() #0 + %tmp6 = tail call i32 (...) @foo() #0 + %tmp7 = tail call i32 (...) @foo() #0 + %tmp8 = add nsw i32 %arg, 1 + %tmp9 = tail call i32 @goo(i32 %tmp8) #0 + %tmp10 = tail call i32 (...) @foo() #0 + %tmp11 = icmp eq i32 %tmp10, 0 + br i1 %tmp11, label %bb12, label %bb16 + +bb12: ; preds = %bb1 + %tmp13 = tail call i32 (...) @foo() #0 + %tmp14 = icmp eq i32 %tmp13, 0 + %tmp15 = select i1 %tmp14, i32 0, i32 3 + br label %bb16 + +bb16: ; preds = %bb12, %bb1, %bb + %tmp17 = phi i32 [ 2, %bb1 ], [ %tmp15, %bb12 ], [ 0, %bb ] + ret i32 %tmp17 +} + +; Function Attrs: nounwind +declare i32 @foo(...) local_unnamed_addr #0 + +; Function Attrs: nounwind +declare i32 @goo(i32) local_unnamed_addr #0 + +; Function Attrs: nounwind +define i32 @dummy_caller(i32 %arg) local_unnamed_addr #0 { +bb: +; CHECK-LABEL: @dummy_caller +; CHECK-NOT: br i1 +; CHECK-NOT: call{{.*}}bar_hot_outline_region. +; NOCOST-LABEL: @dummy_caller +; NOCOST: br i1 +; NOCOST: call{{.*}}bar_hot_outline_region. + + %tmp = tail call i32 @bar_hot_outline_region(i32 %arg) + ret i32 %tmp +} + +define i32 @dummy_caller2(i32 %arg) local_unnamed_addr #0 { +bb: +; CHECK-LABEL: @dummy_caller2 +; CHECK: br i1 +; CHECK: call{{.*}}bar_cold_outline_region. +; NOCOST-LABEL: @dummy_caller2 +; NOCOST: br i1 +; NOCOST: call{{.*}}bar_cold_outline_region. 
+ + %tmp = tail call i32 @bar_cold_outline_region(i32 %arg) + ret i32 %tmp +} + +attributes #0 = { nounwind } + +!llvm.ident = !{!0} + +!0 = !{!"clang version 5.0.0 (trunk 301898)"} +!1 = !{!"branch_weights", i32 2000, i32 1} +!2 = !{!"branch_weights", i32 1, i32 100} diff --git a/test/Transforms/CodeExtractor/PartialInlineOr.ll b/test/Transforms/CodeExtractor/PartialInlineOr.ll index 5408b4faaf70..758945c7ade5 100644 --- a/test/Transforms/CodeExtractor/PartialInlineOr.ll +++ b/test/Transforms/CodeExtractor/PartialInlineOr.ll @@ -1,5 +1,5 @@ -; RUN: opt < %s -partial-inliner -S | FileCheck %s -; RUN: opt < %s -passes=partial-inliner -S | FileCheck %s +; RUN: opt < %s -partial-inliner -skip-partial-inlining-cost-analysis -S | FileCheck %s +; RUN: opt < %s -passes=partial-inliner -skip-partial-inlining-cost-analysis -S | FileCheck %s ; RUN: opt < %s -partial-inliner -max-num-inline-blocks=2 -S | FileCheck --check-prefix=LIMIT %s ; RUN: opt < %s -passes=partial-inliner -max-num-inline-blocks=2 -S | FileCheck --check-prefix=LIMIT %s diff --git a/test/Transforms/CodeExtractor/PartialInlineOrAnd.ll b/test/Transforms/CodeExtractor/PartialInlineOrAnd.ll index 282d300fadb9..fb6d1c335361 100644 --- a/test/Transforms/CodeExtractor/PartialInlineOrAnd.ll +++ b/test/Transforms/CodeExtractor/PartialInlineOrAnd.ll @@ -1,7 +1,7 @@ ; RUN: opt < %s -partial-inliner -S | FileCheck %s ; RUN: opt < %s -passes=partial-inliner -S | FileCheck %s -; RUN: opt < %s -partial-inliner -max-num-inline-blocks=3 -S | FileCheck --check-prefix=LIMIT3 %s -; RUN: opt < %s -passes=partial-inliner -max-num-inline-blocks=3 -S | FileCheck --check-prefix=LIMIT3 %s +; RUN: opt < %s -partial-inliner -max-num-inline-blocks=3 -skip-partial-inlining-cost-analysis -S | FileCheck --check-prefix=LIMIT3 %s +; RUN: opt < %s -passes=partial-inliner -max-num-inline-blocks=3 -skip-partial-inlining-cost-analysis -S | FileCheck --check-prefix=LIMIT3 %s ; RUN: opt < %s -partial-inliner -max-num-inline-blocks=2 -S | FileCheck --check-prefix=LIMIT2 %s ; RUN: opt < %s -passes=partial-inliner -max-num-inline-blocks=2 -S | FileCheck --check-prefix=LIMIT2 %s diff --git a/test/Transforms/CodeExtractor/SingleCondition.ll b/test/Transforms/CodeExtractor/SingleCondition.ll index 90cda889a21b..4110cd95b7ee 100644 --- a/test/Transforms/CodeExtractor/SingleCondition.ll +++ b/test/Transforms/CodeExtractor/SingleCondition.ll @@ -1,5 +1,5 @@ -; RUN: opt < %s -partial-inliner -S | FileCheck %s -; RUN: opt < %s -passes=partial-inliner -S | FileCheck %s +; RUN: opt < %s -skip-partial-inlining-cost-analysis -partial-inliner -S | FileCheck %s +; RUN: opt < %s -skip-partial-inlining-cost-analysis -passes=partial-inliner -S | FileCheck %s define internal i32 @inlinedFunc(i1 %cond, i32* align 4 %align.val) { entry: diff --git a/test/Transforms/CodeExtractor/X86/InheritTargetAttributes.ll b/test/Transforms/CodeExtractor/X86/InheritTargetAttributes.ll index 41d883c8c378..0f8a71907d85 100644 --- a/test/Transforms/CodeExtractor/X86/InheritTargetAttributes.ll +++ b/test/Transforms/CodeExtractor/X86/InheritTargetAttributes.ll @@ -1,5 +1,5 @@ -; RUN: opt < %s -partial-inliner | llc -filetype=null -; RUN: opt < %s -partial-inliner -S | FileCheck %s +; RUN: opt < %s -partial-inliner -skip-partial-inlining-cost-analysis | llc -filetype=null +; RUN: opt < %s -partial-inliner -skip-partial-inlining-cost-analysis -S | FileCheck %s ; This testcase checks to see if CodeExtractor properly inherits ; target specific attributes for the extracted function. 
This can ; cause certain instructions that depend on the attributes to not diff --git a/test/Transforms/CodeGenPrepare/section-samplepgo.ll b/test/Transforms/CodeGenPrepare/section-samplepgo.ll new file mode 100644 index 000000000000..93d2a5f2542c --- /dev/null +++ b/test/Transforms/CodeGenPrepare/section-samplepgo.ll @@ -0,0 +1,57 @@ +; RUN: opt < %s -codegenprepare -S | FileCheck %s + +target triple = "x86_64-pc-linux-gnu" + +; This tests that hot/cold functions get correct section prefix assigned + +; CHECK: hot_func{{.*}}!section_prefix ![[HOT_ID:[0-9]+]] +; The entry is hot +define void @hot_func() !prof !15 { + ret void +} + +; CHECK: hot_call_func{{.*}}!section_prefix ![[HOT_ID]] +; The sum of 2 callsites are hot +define void @hot_call_func() !prof !16 { + call void @hot_func(), !prof !17 + call void @hot_func(), !prof !17 + ret void +} + +; CHECK-NOT: normal_func{{.*}}!section_prefix +; The sum of all callsites are neither hot or cold +define void @normal_func() !prof !16 { + call void @hot_func(), !prof !17 + call void @hot_func(), !prof !18 + call void @hot_func(), !prof !18 + ret void +} + +; CHECK: cold_func{{.*}}!section_prefix ![[COLD_ID:[0-9]+]] +; The entry and the callsite are both cold +define void @cold_func() !prof !16 { + call void @hot_func(), !prof !18 + ret void +} + +; CHECK: ![[HOT_ID]] = !{!"function_section_prefix", !".hot"} +; CHECK: ![[COLD_ID]] = !{!"function_section_prefix", !".unlikely"} +!llvm.module.flags = !{!1} +!1 = !{i32 1, !"ProfileSummary", !2} +!2 = !{!3, !4, !5, !6, !7, !8, !9, !10} +!3 = !{!"ProfileFormat", !"SampleProfile"} +!4 = !{!"TotalCount", i64 10000} +!5 = !{!"MaxCount", i64 1000} +!6 = !{!"MaxInternalCount", i64 1} +!7 = !{!"MaxFunctionCount", i64 1000} +!8 = !{!"NumCounts", i64 3} +!9 = !{!"NumFunctions", i64 3} +!10 = !{!"DetailedSummary", !11} +!11 = !{!12, !13, !14} +!12 = !{i32 10000, i64 100, i32 1} +!13 = !{i32 999000, i64 100, i32 1} +!14 = !{i32 999999, i64 1, i32 2} +!15 = !{!"function_entry_count", i64 1000} +!16 = !{!"function_entry_count", i64 1} +!17 = !{!"branch_weights", i32 80} +!18 = !{!"branch_weights", i32 1} diff --git a/test/Transforms/CodeGenPrepare/section.ll b/test/Transforms/CodeGenPrepare/section.ll index 2c96612e1baf..4f3144e7fc73 100644 --- a/test/Transforms/CodeGenPrepare/section.ll +++ b/test/Transforms/CodeGenPrepare/section.ll @@ -10,32 +10,32 @@ define void @hot_func() !prof !15 { ret void } -; CHECK: hot_call_func{{.*}}!section_prefix ![[HOT_ID]] -; The sum of 2 callsites are hot -define void @hot_call_func() !prof !16 { +; For instrumentation based PGO, we should only look at entry counts, +; not call site VP metadata (which can exist on value profiled memcpy, +; or possibly left behind after static analysis based devirtualization). 
+; CHECK: cold_func1{{.*}}!section_prefix ![[COLD_ID:[0-9]+]] +define void @cold_func1() !prof !16 { call void @hot_func(), !prof !17 call void @hot_func(), !prof !17 ret void } -; CHECK-NOT: normal_func{{.*}}!section_prefix -; The sum of all callsites are neither hot or cold -define void @normal_func() !prof !16 { +; CHECK: cold_func2{{.*}}!section_prefix +define void @cold_func2() !prof !16 { call void @hot_func(), !prof !17 call void @hot_func(), !prof !18 call void @hot_func(), !prof !18 ret void } -; CHECK: cold_func{{.*}}!section_prefix ![[COLD_ID:[0-9]+]] -; The entry and the callsite are both cold -define void @cold_func() !prof !16 { +; CHECK: cold_func3{{.*}}!section_prefix ![[COLD_ID]] +define void @cold_func3() !prof !16 { call void @hot_func(), !prof !18 ret void } ; CHECK: ![[HOT_ID]] = !{!"function_section_prefix", !".hot"} -; CHECK: ![[COLD_ID]] = !{!"function_section_prefix", !".cold"} +; CHECK: ![[COLD_ID]] = !{!"function_section_prefix", !".unlikely"} !llvm.module.flags = !{!1} !1 = !{i32 1, !"ProfileSummary", !2} !2 = !{!3, !4, !5, !6, !7, !8, !9, !10} diff --git a/test/Transforms/ConstProp/calls-math-finite.ll b/test/Transforms/ConstProp/calls-math-finite.ll new file mode 100644 index 000000000000..00041f3e4a4b --- /dev/null +++ b/test/Transforms/ConstProp/calls-math-finite.ll @@ -0,0 +1,83 @@ +; RUN: opt < %s -constprop -S | FileCheck %s + +; Test to verify constant folding can occur when math +; routines are mapped to the __<func>_finite versions +; of functions due to __FINITE_MATH_ONLY__ being +; enabled on headers. All calls should constant +; fold away in this test. + +declare double @__acos_finite(double) #0 +declare float @__acosf_finite(float) #0 +declare double @__asin_finite(double) #0 +declare float @__asinf_finite(float) #0 +declare double @__atan2_finite(double, double) #0 +declare float @__atan2f_finite(float, float) #0 +declare double @__cosh_finite(double) #0 +declare float @__coshf_finite(float) #0 +declare double @__exp2_finite(double) #0 +declare float @__exp2f_finite(float) #0 +declare double @__exp_finite(double) #0 +declare float @__expf_finite(float) #0 +declare double @__log10_finite(double) #0 +declare float @__log10f_finite(float) #0 +declare double @__log_finite(double) #0 +declare float @__logf_finite(float) #0 +declare double @__pow_finite(double, double) #0 +declare float @__powf_finite(float, float) #0 +declare double @__sinh_finite(double) #0 +declare float @__sinhf_finite(float) #0 + +attributes #0 = { nounwind readnone } + +define void @T() { +; CHECK-LABEL: @T( + +; CHECK-NOT: call +; CHECK: ret + + %slot = alloca double + %slotf = alloca float + + %ACOS = call fast double @__acos_finite(double 1.000000e+00) + store double %ACOS, double* %slot + %ASIN = call fast double @__asin_finite(double 1.000000e+00) + store double %ASIN, double* %slot + %ATAN2 = call fast double @__atan2_finite(double 3.000000e+00, double 4.000000e+00) + store double %ATAN2, double* %slot + %COSH = call fast double @__cosh_finite(double 3.000000e+00) + store double %COSH, double* %slot + %EXP = call fast double @__exp_finite(double 3.000000e+00) + store double %EXP, double* %slot + %EXP2 = call fast double @__exp2_finite(double 3.000000e+00) + store double %EXP2, double* %slot + %LOG = call fast double @__log_finite(double 3.000000e+00) + store double %LOG, double* %slot + %LOG10 = call fast double @__log10_finite(double 3.000000e+00) + store double %LOG10, double* %slot + %POW = call fast double @__pow_finite(double 1.000000e+00, double 4.000000e+00) + store 
double %POW, double* %slot + %SINH = call fast double @__sinh_finite(double 3.000000e+00) + store double %SINH, double* %slot + + %ACOSF = call fast float @__acosf_finite(float 1.000000e+00) + store float %ACOSF, float* %slotf + %ASINF = call fast float @__asinf_finite(float 1.000000e+00) + store float %ASINF, float* %slotf + %ATAN2F = call fast float @__atan2f_finite(float 3.000000e+00, float 4.000000e+00) + store float %ATAN2F, float* %slotf + %COSHF = call fast float @__coshf_finite(float 3.000000e+00) + store float %COSHF, float* %slotf + %EXPF = call fast float @__expf_finite(float 3.000000e+00) + store float %EXPF, float* %slotf + %EXP2F = call fast float @__exp2f_finite(float 3.000000e+00) + store float %EXP2F, float* %slotf + %LOGF = call fast float @__logf_finite(float 3.000000e+00) + store float %LOGF, float* %slotf + %LOG10F = call fast float @__log10f_finite(float 3.000000e+00) + store float %LOG10F, float* %slotf + %POWF = call fast float @__powf_finite(float 3.000000e+00, float 4.000000e+00) + store float %POWF, float* %slotf + %SINHF = call fast float @__sinhf_finite(float 3.000000e+00) + store float %SINHF, float* %slotf + ret void +} diff --git a/test/Transforms/ConstProp/calls.ll b/test/Transforms/ConstProp/calls.ll index 1175ea522175..161637cc92b8 100644 --- a/test/Transforms/ConstProp/calls.ll +++ b/test/Transforms/ConstProp/calls.ll @@ -184,212 +184,6 @@ define double @T() { ret double %d } -define i1 @test_sse_cvts_exact() nounwind readnone { -; CHECK-LABEL: @test_sse_cvts_exact( -; CHECK-NOT: call -; CHECK: ret i1 true -entry: - %i0 = tail call i32 @llvm.x86.sse.cvtss2si(<4 x float> <float 3.0, float undef, float undef, float undef>) nounwind - %i1 = tail call i64 @llvm.x86.sse.cvtss2si64(<4 x float> <float 3.0, float undef, float undef, float undef>) nounwind - %i2 = call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> <double 7.0, double undef>) nounwind - %i3 = call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> <double 7.0, double undef>) nounwind - %sum02 = add i32 %i0, %i2 - %sum13 = add i64 %i1, %i3 - %cmp02 = icmp eq i32 %sum02, 10 - %cmp13 = icmp eq i64 %sum13, 10 - %b = and i1 %cmp02, %cmp13 - ret i1 %b -} - -; Inexact values should not fold as they are dependent on rounding mode -define i1 @test_sse_cvts_inexact() nounwind readnone { -; CHECK-LABEL: @test_sse_cvts_inexact( -; CHECK: call -; CHECK: call -; CHECK: call -; CHECK: call -entry: - %i0 = tail call i32 @llvm.x86.sse.cvtss2si(<4 x float> <float 1.75, float undef, float undef, float undef>) nounwind - %i1 = tail call i64 @llvm.x86.sse.cvtss2si64(<4 x float> <float 1.75, float undef, float undef, float undef>) nounwind - %i2 = call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> <double 1.75, double undef>) nounwind - %i3 = call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> <double 1.75, double undef>) nounwind - %sum02 = add i32 %i0, %i2 - %sum13 = add i64 %i1, %i3 - %cmp02 = icmp eq i32 %sum02, 4 - %cmp13 = icmp eq i64 %sum13, 4 - %b = and i1 %cmp02, %cmp13 - ret i1 %b -} - -; FLT_MAX/DBL_MAX should not fold -define i1 @test_sse_cvts_max() nounwind readnone { -; CHECK-LABEL: @test_sse_cvts_max( -; CHECK: call -; CHECK: call -; CHECK: call -; CHECK: call -entry: - %fm = bitcast <4 x i32> <i32 2139095039, i32 undef, i32 undef, i32 undef> to <4 x float> - %dm = bitcast <2 x i64> <i64 9218868437227405311, i64 undef> to <2 x double> - %i0 = tail call i32 @llvm.x86.sse.cvtss2si(<4 x float> %fm) nounwind - %i1 = tail call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %fm) nounwind - %i2 = call i32 @llvm.x86.sse2.cvtsd2si(<2 x 
double> %dm) nounwind - %i3 = call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> %dm) nounwind - %sum02 = add i32 %i0, %i2 - %sum13 = add i64 %i1, %i3 - %sum02.sext = sext i32 %sum02 to i64 - %b = icmp eq i64 %sum02.sext, %sum13 - ret i1 %b -} - -; INF should not fold -define i1 @test_sse_cvts_inf() nounwind readnone { -; CHECK-LABEL: @test_sse_cvts_inf( -; CHECK: call -; CHECK: call -; CHECK: call -; CHECK: call -entry: - %fm = bitcast <4 x i32> <i32 2139095040, i32 undef, i32 undef, i32 undef> to <4 x float> - %dm = bitcast <2 x i64> <i64 9218868437227405312, i64 undef> to <2 x double> - %i0 = tail call i32 @llvm.x86.sse.cvtss2si(<4 x float> %fm) nounwind - %i1 = tail call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %fm) nounwind - %i2 = call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %dm) nounwind - %i3 = call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> %dm) nounwind - %sum02 = add i32 %i0, %i2 - %sum13 = add i64 %i1, %i3 - %sum02.sext = sext i32 %sum02 to i64 - %b = icmp eq i64 %sum02.sext, %sum13 - ret i1 %b -} - -; NAN should not fold -define i1 @test_sse_cvts_nan() nounwind readnone { -; CHECK-LABEL: @test_sse_cvts_nan( -; CHECK: call -; CHECK: call -; CHECK: call -; CHECK: call -entry: - %fm = bitcast <4 x i32> <i32 2143289344, i32 undef, i32 undef, i32 undef> to <4 x float> - %dm = bitcast <2 x i64> <i64 9221120237041090560, i64 undef> to <2 x double> - %i0 = tail call i32 @llvm.x86.sse.cvtss2si(<4 x float> %fm) nounwind - %i1 = tail call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %fm) nounwind - %i2 = call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %dm) nounwind - %i3 = call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> %dm) nounwind - %sum02 = add i32 %i0, %i2 - %sum13 = add i64 %i1, %i3 - %sum02.sext = sext i32 %sum02 to i64 - %b = icmp eq i64 %sum02.sext, %sum13 - ret i1 %b -} - -define i1 @test_sse_cvtts_exact() nounwind readnone { -; CHECK-LABEL: @test_sse_cvtts_exact( -; CHECK-NOT: call -; CHECK: ret i1 true -entry: - %i0 = tail call i32 @llvm.x86.sse.cvttss2si(<4 x float> <float 3.0, float undef, float undef, float undef>) nounwind - %i1 = tail call i64 @llvm.x86.sse.cvttss2si64(<4 x float> <float 3.0, float undef, float undef, float undef>) nounwind - %i2 = call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> <double 7.0, double undef>) nounwind - %i3 = call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> <double 7.0, double undef>) nounwind - %sum02 = add i32 %i0, %i2 - %sum13 = add i64 %i1, %i3 - %cmp02 = icmp eq i32 %sum02, 10 - %cmp13 = icmp eq i64 %sum13, 10 - %b = and i1 %cmp02, %cmp13 - ret i1 %b -} - -define i1 @test_sse_cvtts_inexact() nounwind readnone { -; CHECK-LABEL: @test_sse_cvtts_inexact( -; CHECK-NOT: call -; CHECK: ret i1 true -entry: - %i0 = tail call i32 @llvm.x86.sse.cvttss2si(<4 x float> <float 1.75, float undef, float undef, float undef>) nounwind - %i1 = tail call i64 @llvm.x86.sse.cvttss2si64(<4 x float> <float 1.75, float undef, float undef, float undef>) nounwind - %i2 = call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> <double 1.75, double undef>) nounwind - %i3 = call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> <double 1.75, double undef>) nounwind - %sum02 = add i32 %i0, %i2 - %sum13 = add i64 %i1, %i3 - %cmp02 = icmp eq i32 %sum02, 2 - %cmp13 = icmp eq i64 %sum13, 2 - %b = and i1 %cmp02, %cmp13 - ret i1 %b -} - -; FLT_MAX/DBL_MAX should not fold -define i1 @test_sse_cvtts_max() nounwind readnone { -; CHECK-LABEL: @test_sse_cvtts_max( -; CHECK: call -; CHECK: call -; CHECK: call -; CHECK: call -entry: - %fm = bitcast <4 x i32> <i32 2139095039, i32 undef, i32 undef, i32 
undef> to <4 x float> - %dm = bitcast <2 x i64> <i64 9218868437227405311, i64 undef> to <2 x double> - %i0 = tail call i32 @llvm.x86.sse.cvttss2si(<4 x float> %fm) nounwind - %i1 = tail call i64 @llvm.x86.sse.cvttss2si64(<4 x float> %fm) nounwind - %i2 = call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> %dm) nounwind - %i3 = call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> %dm) nounwind - %sum02 = add i32 %i0, %i2 - %sum13 = add i64 %i1, %i3 - %sum02.sext = sext i32 %sum02 to i64 - %b = icmp eq i64 %sum02.sext, %sum13 - ret i1 %b -} - -; INF should not fold -define i1 @test_sse_cvtts_inf() nounwind readnone { -; CHECK-LABEL: @test_sse_cvtts_inf( -; CHECK: call -; CHECK: call -; CHECK: call -; CHECK: call -entry: - %fm = bitcast <4 x i32> <i32 2139095040, i32 undef, i32 undef, i32 undef> to <4 x float> - %dm = bitcast <2 x i64> <i64 9218868437227405312, i64 undef> to <2 x double> - %i0 = tail call i32 @llvm.x86.sse.cvttss2si(<4 x float> %fm) nounwind - %i1 = tail call i64 @llvm.x86.sse.cvttss2si64(<4 x float> %fm) nounwind - %i2 = call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> %dm) nounwind - %i3 = call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> %dm) nounwind - %sum02 = add i32 %i0, %i2 - %sum13 = add i64 %i1, %i3 - %sum02.sext = sext i32 %sum02 to i64 - %b = icmp eq i64 %sum02.sext, %sum13 - ret i1 %b -} - -; NAN should not fold -define i1 @test_sse_cvtts_nan() nounwind readnone { -; CHECK-LABEL: @test_sse_cvtts_nan( -; CHECK: call -; CHECK: call -; CHECK: call -; CHECK: call -entry: - %fm = bitcast <4 x i32> <i32 2143289344, i32 undef, i32 undef, i32 undef> to <4 x float> - %dm = bitcast <2 x i64> <i64 9221120237041090560, i64 undef> to <2 x double> - %i0 = tail call i32 @llvm.x86.sse.cvttss2si(<4 x float> %fm) nounwind - %i1 = tail call i64 @llvm.x86.sse.cvttss2si64(<4 x float> %fm) nounwind - %i2 = call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> %dm) nounwind - %i3 = call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> %dm) nounwind - %sum02 = add i32 %i0, %i2 - %sum13 = add i64 %i1, %i3 - %sum02.sext = sext i32 %sum02 to i64 - %b = icmp eq i64 %sum02.sext, %sum13 - ret i1 %b -} - -declare i32 @llvm.x86.sse.cvtss2si(<4 x float>) nounwind readnone -declare i32 @llvm.x86.sse.cvttss2si(<4 x float>) nounwind readnone -declare i64 @llvm.x86.sse.cvtss2si64(<4 x float>) nounwind readnone -declare i64 @llvm.x86.sse.cvttss2si64(<4 x float>) nounwind readnone -declare i32 @llvm.x86.sse2.cvtsd2si(<2 x double>) nounwind readnone -declare i32 @llvm.x86.sse2.cvttsd2si(<2 x double>) nounwind readnone -declare i64 @llvm.x86.sse2.cvtsd2si64(<2 x double>) nounwind readnone -declare i64 @llvm.x86.sse2.cvttsd2si64(<2 x double>) nounwind readnone - define double @test_intrinsic_pow() nounwind uwtable ssp { entry: ; CHECK-LABEL: @test_intrinsic_pow( diff --git a/test/Transforms/ConstProp/sse.ll b/test/Transforms/ConstProp/sse.ll new file mode 100644 index 000000000000..cc37c96c1ff1 --- /dev/null +++ b/test/Transforms/ConstProp/sse.ll @@ -0,0 +1,208 @@ +; RUN: opt < %s -constprop -S | FileCheck %s +; REQUIRES: x86 + +define i1 @test_sse_cvts_exact() nounwind readnone { +; CHECK-LABEL: @test_sse_cvts_exact( +; CHECK-NOT: call +; CHECK: ret i1 true +entry: + %i0 = tail call i32 @llvm.x86.sse.cvtss2si(<4 x float> <float 3.0, float undef, float undef, float undef>) nounwind + %i1 = tail call i64 @llvm.x86.sse.cvtss2si64(<4 x float> <float 3.0, float undef, float undef, float undef>) nounwind + %i2 = call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> <double 7.0, double undef>) nounwind + %i3 = call i64 
@llvm.x86.sse2.cvtsd2si64(<2 x double> <double 7.0, double undef>) nounwind + %sum02 = add i32 %i0, %i2 + %sum13 = add i64 %i1, %i3 + %cmp02 = icmp eq i32 %sum02, 10 + %cmp13 = icmp eq i64 %sum13, 10 + %b = and i1 %cmp02, %cmp13 + ret i1 %b +} + +; Inexact values should not fold as they are dependent on rounding mode +define i1 @test_sse_cvts_inexact() nounwind readnone { +; CHECK-LABEL: @test_sse_cvts_inexact( +; CHECK: call +; CHECK: call +; CHECK: call +; CHECK: call +entry: + %i0 = tail call i32 @llvm.x86.sse.cvtss2si(<4 x float> <float 1.75, float undef, float undef, float undef>) nounwind + %i1 = tail call i64 @llvm.x86.sse.cvtss2si64(<4 x float> <float 1.75, float undef, float undef, float undef>) nounwind + %i2 = call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> <double 1.75, double undef>) nounwind + %i3 = call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> <double 1.75, double undef>) nounwind + %sum02 = add i32 %i0, %i2 + %sum13 = add i64 %i1, %i3 + %cmp02 = icmp eq i32 %sum02, 4 + %cmp13 = icmp eq i64 %sum13, 4 + %b = and i1 %cmp02, %cmp13 + ret i1 %b +} + +; FLT_MAX/DBL_MAX should not fold +define i1 @test_sse_cvts_max() nounwind readnone { +; CHECK-LABEL: @test_sse_cvts_max( +; CHECK: call +; CHECK: call +; CHECK: call +; CHECK: call +entry: + %fm = bitcast <4 x i32> <i32 2139095039, i32 undef, i32 undef, i32 undef> to <4 x float> + %dm = bitcast <2 x i64> <i64 9218868437227405311, i64 undef> to <2 x double> + %i0 = tail call i32 @llvm.x86.sse.cvtss2si(<4 x float> %fm) nounwind + %i1 = tail call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %fm) nounwind + %i2 = call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %dm) nounwind + %i3 = call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> %dm) nounwind + %sum02 = add i32 %i0, %i2 + %sum13 = add i64 %i1, %i3 + %sum02.sext = sext i32 %sum02 to i64 + %b = icmp eq i64 %sum02.sext, %sum13 + ret i1 %b +} + +; INF should not fold +define i1 @test_sse_cvts_inf() nounwind readnone { +; CHECK-LABEL: @test_sse_cvts_inf( +; CHECK: call +; CHECK: call +; CHECK: call +; CHECK: call +entry: + %fm = bitcast <4 x i32> <i32 2139095040, i32 undef, i32 undef, i32 undef> to <4 x float> + %dm = bitcast <2 x i64> <i64 9218868437227405312, i64 undef> to <2 x double> + %i0 = tail call i32 @llvm.x86.sse.cvtss2si(<4 x float> %fm) nounwind + %i1 = tail call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %fm) nounwind + %i2 = call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %dm) nounwind + %i3 = call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> %dm) nounwind + %sum02 = add i32 %i0, %i2 + %sum13 = add i64 %i1, %i3 + %sum02.sext = sext i32 %sum02 to i64 + %b = icmp eq i64 %sum02.sext, %sum13 + ret i1 %b +} + +; NAN should not fold +define i1 @test_sse_cvts_nan() nounwind readnone { +; CHECK-LABEL: @test_sse_cvts_nan( +; CHECK: call +; CHECK: call +; CHECK: call +; CHECK: call +entry: + %fm = bitcast <4 x i32> <i32 2143289344, i32 undef, i32 undef, i32 undef> to <4 x float> + %dm = bitcast <2 x i64> <i64 9221120237041090560, i64 undef> to <2 x double> + %i0 = tail call i32 @llvm.x86.sse.cvtss2si(<4 x float> %fm) nounwind + %i1 = tail call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %fm) nounwind + %i2 = call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %dm) nounwind + %i3 = call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> %dm) nounwind + %sum02 = add i32 %i0, %i2 + %sum13 = add i64 %i1, %i3 + %sum02.sext = sext i32 %sum02 to i64 + %b = icmp eq i64 %sum02.sext, %sum13 + ret i1 %b +} + +define i1 @test_sse_cvtts_exact() nounwind readnone { +; CHECK-LABEL: @test_sse_cvtts_exact( +; CHECK-NOT: call +; 
CHECK: ret i1 true +entry: + %i0 = tail call i32 @llvm.x86.sse.cvttss2si(<4 x float> <float 3.0, float undef, float undef, float undef>) nounwind + %i1 = tail call i64 @llvm.x86.sse.cvttss2si64(<4 x float> <float 3.0, float undef, float undef, float undef>) nounwind + %i2 = call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> <double 7.0, double undef>) nounwind + %i3 = call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> <double 7.0, double undef>) nounwind + %sum02 = add i32 %i0, %i2 + %sum13 = add i64 %i1, %i3 + %cmp02 = icmp eq i32 %sum02, 10 + %cmp13 = icmp eq i64 %sum13, 10 + %b = and i1 %cmp02, %cmp13 + ret i1 %b +} + +define i1 @test_sse_cvtts_inexact() nounwind readnone { +; CHECK-LABEL: @test_sse_cvtts_inexact( +; CHECK-NOT: call +; CHECK: ret i1 true +entry: + %i0 = tail call i32 @llvm.x86.sse.cvttss2si(<4 x float> <float 1.75, float undef, float undef, float undef>) nounwind + %i1 = tail call i64 @llvm.x86.sse.cvttss2si64(<4 x float> <float 1.75, float undef, float undef, float undef>) nounwind + %i2 = call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> <double 1.75, double undef>) nounwind + %i3 = call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> <double 1.75, double undef>) nounwind + %sum02 = add i32 %i0, %i2 + %sum13 = add i64 %i1, %i3 + %cmp02 = icmp eq i32 %sum02, 2 + %cmp13 = icmp eq i64 %sum13, 2 + %b = and i1 %cmp02, %cmp13 + ret i1 %b +} + +; FLT_MAX/DBL_MAX should not fold +define i1 @test_sse_cvtts_max() nounwind readnone { +; CHECK-LABEL: @test_sse_cvtts_max( +; CHECK: call +; CHECK: call +; CHECK: call +; CHECK: call +entry: + %fm = bitcast <4 x i32> <i32 2139095039, i32 undef, i32 undef, i32 undef> to <4 x float> + %dm = bitcast <2 x i64> <i64 9218868437227405311, i64 undef> to <2 x double> + %i0 = tail call i32 @llvm.x86.sse.cvttss2si(<4 x float> %fm) nounwind + %i1 = tail call i64 @llvm.x86.sse.cvttss2si64(<4 x float> %fm) nounwind + %i2 = call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> %dm) nounwind + %i3 = call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> %dm) nounwind + %sum02 = add i32 %i0, %i2 + %sum13 = add i64 %i1, %i3 + %sum02.sext = sext i32 %sum02 to i64 + %b = icmp eq i64 %sum02.sext, %sum13 + ret i1 %b +} + +; INF should not fold +define i1 @test_sse_cvtts_inf() nounwind readnone { +; CHECK-LABEL: @test_sse_cvtts_inf( +; CHECK: call +; CHECK: call +; CHECK: call +; CHECK: call +entry: + %fm = bitcast <4 x i32> <i32 2139095040, i32 undef, i32 undef, i32 undef> to <4 x float> + %dm = bitcast <2 x i64> <i64 9218868437227405312, i64 undef> to <2 x double> + %i0 = tail call i32 @llvm.x86.sse.cvttss2si(<4 x float> %fm) nounwind + %i1 = tail call i64 @llvm.x86.sse.cvttss2si64(<4 x float> %fm) nounwind + %i2 = call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> %dm) nounwind + %i3 = call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> %dm) nounwind + %sum02 = add i32 %i0, %i2 + %sum13 = add i64 %i1, %i3 + %sum02.sext = sext i32 %sum02 to i64 + %b = icmp eq i64 %sum02.sext, %sum13 + ret i1 %b +} + +; NAN should not fold +define i1 @test_sse_cvtts_nan() nounwind readnone { +; CHECK-LABEL: @test_sse_cvtts_nan( +; CHECK: call +; CHECK: call +; CHECK: call +; CHECK: call +entry: + %fm = bitcast <4 x i32> <i32 2143289344, i32 undef, i32 undef, i32 undef> to <4 x float> + %dm = bitcast <2 x i64> <i64 9221120237041090560, i64 undef> to <2 x double> + %i0 = tail call i32 @llvm.x86.sse.cvttss2si(<4 x float> %fm) nounwind + %i1 = tail call i64 @llvm.x86.sse.cvttss2si64(<4 x float> %fm) nounwind + %i2 = call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> %dm) nounwind + %i3 = call i64 
@llvm.x86.sse2.cvttsd2si64(<2 x double> %dm) nounwind + %sum02 = add i32 %i0, %i2 + %sum13 = add i64 %i1, %i3 + %sum02.sext = sext i32 %sum02 to i64 + %b = icmp eq i64 %sum02.sext, %sum13 + ret i1 %b +} + +declare i32 @llvm.x86.sse.cvtss2si(<4 x float>) nounwind readnone +declare i32 @llvm.x86.sse.cvttss2si(<4 x float>) nounwind readnone +declare i64 @llvm.x86.sse.cvtss2si64(<4 x float>) nounwind readnone +declare i64 @llvm.x86.sse.cvttss2si64(<4 x float>) nounwind readnone +declare i32 @llvm.x86.sse2.cvtsd2si(<2 x double>) nounwind readnone +declare i32 @llvm.x86.sse2.cvttsd2si(<2 x double>) nounwind readnone +declare i64 @llvm.x86.sse2.cvtsd2si64(<2 x double>) nounwind readnone +declare i64 @llvm.x86.sse2.cvttsd2si64(<2 x double>) nounwind readnone diff --git a/test/Transforms/Coroutines/coro-eh-aware-edge-split.ll b/test/Transforms/Coroutines/coro-eh-aware-edge-split.ll new file mode 100644 index 000000000000..5da0e3c199db --- /dev/null +++ b/test/Transforms/Coroutines/coro-eh-aware-edge-split.ll @@ -0,0 +1,218 @@ +; Check that we can handle edge splits leading into a landingpad +; RUN: opt < %s -coro-split -S | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; CHECK-LABEL: define internal fastcc void @f.resume( +define void @f(i1 %cond) "coroutine.presplit"="1" personality i32 0 { +entry: + %id = call token @llvm.coro.id(i32 16, i8* null, i8* null, i8* null) + %size = tail call i64 @llvm.coro.size.i64() + %alloc = call i8* @malloc(i64 %size) + %hdl = call i8* @llvm.coro.begin(token %id, i8* %alloc) + %sp = call i8 @llvm.coro.suspend(token none, i1 false) + switch i8 %sp, label %coro.ret [ + i8 0, label %resume + i8 1, label %cleanup + ] + +resume: + br i1 %cond, label %invoke1, label %invoke2 + +invoke1: + invoke void @may_throw1() + to label %unreach unwind label %pad.with.phi +invoke2: + invoke void @may_throw2() + to label %unreach unwind label %pad.with.phi + +; Verify that we cloned landing pad on every edge and inserted a reload of the spilled value + +; CHECK: pad.with.phi.from.invoke2: +; CHECK: %0 = landingpad { i8*, i32 } +; CHECK: catch i8* null +; CHECK: br label %pad.with.phi + +; CHECK: pad.with.phi.from.invoke1: +; CHECK: %1 = landingpad { i8*, i32 } +; CHECK: catch i8* null +; CHECK: br label %pad.with.phi + +; CHECK: pad.with.phi: +; CHECK: %val = phi i32 [ 0, %pad.with.phi.from.invoke1 ], [ 1, %pad.with.phi.from.invoke2 ] +; CHECK: %lp = phi { i8*, i32 } [ %0, %pad.with.phi.from.invoke2 ], [ %1, %pad.with.phi.from.invoke1 ] +; CHECK: %exn = extractvalue { i8*, i32 } %lp, 0 +; CHECK: call i8* @__cxa_begin_catch(i8* %exn) +; CHECK: call void @use_val(i32 %val) +; CHECK: call void @__cxa_end_catch() +; CHECK: call void @free(i8* %vFrame) +; CHECK: ret void + +pad.with.phi: + %val = phi i32 [ 0, %invoke1 ], [ 1, %invoke2 ] + %lp = landingpad { i8*, i32 } + catch i8* null + %exn = extractvalue { i8*, i32 } %lp, 0 + call i8* @__cxa_begin_catch(i8* %exn) + call void @use_val(i32 %val) + call void @__cxa_end_catch() + br label %cleanup + +cleanup: ; preds = %invoke.cont15, %if.else, %if.then, %ehcleanup21, %init.suspend + %mem = call i8* @llvm.coro.free(token %id, i8* %hdl) + call void @free(i8* %mem) + br label %coro.ret + +coro.ret: + call i1 @llvm.coro.end(i8* null, i1 false) + ret void + +unreach: + unreachable +} + +; CHECK-LABEL: define internal fastcc void @g.resume( +define void @g(i1 %cond, i32 %x, i32 %y) "coroutine.presplit"="1" personality i32 0 { +entry: + %id = call token 
@llvm.coro.id(i32 16, i8* null, i8* null, i8* null) + %size = tail call i64 @llvm.coro.size.i64() + %alloc = call i8* @malloc(i64 %size) + %hdl = call i8* @llvm.coro.begin(token %id, i8* %alloc) + %sp = call i8 @llvm.coro.suspend(token none, i1 false) + switch i8 %sp, label %coro.ret [ + i8 0, label %resume + i8 1, label %cleanup + ] + +resume: + br i1 %cond, label %invoke1, label %invoke2 + +invoke1: + invoke void @may_throw1() + to label %unreach unwind label %pad.with.phi +invoke2: + invoke void @may_throw2() + to label %unreach unwind label %pad.with.phi + +; Verify that we created cleanuppads on every edge and inserted a reload of the spilled value + +; CHECK: pad.with.phi.from.invoke2: +; CHECK: %0 = cleanuppad within none [] +; CHECK: %y.reload.addr = getelementptr inbounds %g.Frame, %g.Frame* %FramePtr, i32 0, i32 6 +; CHECK: %y.reload = load i32, i32* %y.reload.addr +; CHECK: cleanupret from %0 unwind label %pad.with.phi + +; CHECK: pad.with.phi.from.invoke1: +; CHECK: %1 = cleanuppad within none [] +; CHECK: %x.reload.addr = getelementptr inbounds %g.Frame, %g.Frame* %FramePtr, i32 0, i32 5 +; CHECK: %x.reload = load i32, i32* %x.reload.addr +; CHECK: cleanupret from %1 unwind label %pad.with.phi + +; CHECK: pad.with.phi: +; CHECK: %val = phi i32 [ %x.reload, %pad.with.phi.from.invoke1 ], [ %y.reload, %pad.with.phi.from.invoke2 ] +; CHECK: %tok = cleanuppad within none [] +; CHECK: call void @use_val(i32 %val) +; CHECK: cleanupret from %tok unwind to caller + +pad.with.phi: + %val = phi i32 [ %x, %invoke1 ], [ %y, %invoke2 ] + %tok = cleanuppad within none [] + call void @use_val(i32 %val) + cleanupret from %tok unwind to caller + +cleanup: ; preds = %invoke.cont15, %if.else, %if.then, %ehcleanup21, %init.suspend + %mem = call i8* @llvm.coro.free(token %id, i8* %hdl) + call void @free(i8* %mem) + br label %coro.ret + +coro.ret: + call i1 @llvm.coro.end(i8* null, i1 false) + ret void + +unreach: + unreachable +} + +; CHECK-LABEL: define internal fastcc void @h.resume( +define void @h(i1 %cond, i32 %x, i32 %y) "coroutine.presplit"="1" personality i32 0 { +entry: + %id = call token @llvm.coro.id(i32 16, i8* null, i8* null, i8* null) + %size = tail call i64 @llvm.coro.size.i64() + %alloc = call i8* @malloc(i64 %size) + %hdl = call i8* @llvm.coro.begin(token %id, i8* %alloc) + %sp = call i8 @llvm.coro.suspend(token none, i1 false) + switch i8 %sp, label %coro.ret [ + i8 0, label %resume + i8 1, label %cleanup + ] + +resume: + br i1 %cond, label %invoke1, label %invoke2 + +invoke1: + invoke void @may_throw1() + to label %coro.ret unwind label %pad.with.phi +invoke2: + invoke void @may_throw2() + to label %coro.ret unwind label %pad.with.phi + +; Verify that we created cleanuppads on every edge and inserted a reload of the spilled value + +; CHECK: pad.with.phi.from.invoke2: +; CHECK: %0 = cleanuppad within none [] +; CHECK: %y.reload.addr = getelementptr inbounds %h.Frame, %h.Frame* %FramePtr, i32 0, i32 6 +; CHECK: %y.reload = load i32, i32* %y.reload.addr +; CHECK: cleanupret from %0 unwind label %pad.with.phi + +; CHECK: pad.with.phi.from.invoke1: +; CHECK: %1 = cleanuppad within none [] +; CHECK: %x.reload.addr = getelementptr inbounds %h.Frame, %h.Frame* %FramePtr, i32 0, i32 5 +; CHECK: %x.reload = load i32, i32* %x.reload.addr +; CHECK: cleanupret from %1 unwind label %pad.with.phi + +; CHECK: pad.with.phi: +; CHECK: %val = phi i32 [ %x.reload, %pad.with.phi.from.invoke1 ], [ %y.reload, %pad.with.phi.from.invoke2 ] +; CHECK: %switch = catchswitch within none [label %catch] 
unwind to caller +pad.with.phi: + %val = phi i32 [ %x, %invoke1 ], [ %y, %invoke2 ] + %switch = catchswitch within none [label %catch] unwind to caller + +catch: ; preds = %catch.dispatch + %pad = catchpad within %switch [i8* null, i32 64, i8* null] + call void @use_val(i32 %val) + catchret from %pad to label %coro.ret + +cleanup: ; preds = %invoke.cont15, %if.else, %if.then, %ehcleanup21, %init.suspend + %mem = call i8* @llvm.coro.free(token %id, i8* %hdl) + call void @free(i8* %mem) + br label %coro.ret + +coro.ret: + call i1 @llvm.coro.end(i8* null, i1 false) + ret void +} + +; Function Attrs: argmemonly nounwind readonly +declare token @llvm.coro.id(i32, i8* readnone, i8* nocapture readonly, i8*) +declare noalias i8* @malloc(i64) +declare i64 @llvm.coro.size.i64() +declare i8* @llvm.coro.begin(token, i8* writeonly) + +; Function Attrs: nounwind +declare token @llvm.coro.save(i8*) +declare i8 @llvm.coro.suspend(token, i1) + +; Function Attrs: argmemonly nounwind +declare void @may_throw1() +declare void @may_throw2() + +declare i8* @__cxa_begin_catch(i8*) + +declare void @use_val(i32) +declare void @__cxa_end_catch() + +; Function Attrs: nounwind +declare i1 @llvm.coro.end(i8*, i1) +declare void @free(i8*) +declare i8* @llvm.coro.free(token, i8* nocapture readonly) diff --git a/test/Transforms/GVN/PRE/2011-06-01-NonLocalMemdepMiscompile.ll b/test/Transforms/GVN/PRE/2011-06-01-NonLocalMemdepMiscompile.ll index 0769575759ba..05dc79db95ad 100644 --- a/test/Transforms/GVN/PRE/2011-06-01-NonLocalMemdepMiscompile.ll +++ b/test/Transforms/GVN/PRE/2011-06-01-NonLocalMemdepMiscompile.ll @@ -5,8 +5,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" target triple = "x86_64-apple-macosx10.7.0" - -define i1 @rb_intern() nounwind ssp { +define i1 @rb_intern(i8 *%foo) nounwind ssp { ; CHECK-LABEL: @rb_intern( bb: @@ -19,7 +18,7 @@ bb1: br i1 undef, label %bb3, label %bb15 ; CHECK: bb1: -; CHECK: [[TMP:%.*]] = phi i8* [ getelementptr (i8, i8* null, i64 undef), %bb10 ], [ null, %bb ] +; CHECK: [[TMP:%.*]] = phi i8* [ %tmp14, %bb10 ], [ null, %bb ] ; CHECK: bb1.bb15_crit_edge: ; CHECK: %tmp17.pre = load i8, i8* [[TMP]], align 1 @@ -41,7 +40,7 @@ bb10: %tmp11 = load i8*, i8** %tmp, align 8 %tmp12 = load i8, i8* %tmp11, align 1 %tmp13 = zext i8 %tmp12 to i64 - %tmp14 = getelementptr inbounds i8, i8* null, i64 undef + %tmp14 = getelementptr inbounds i8, i8* %foo, i64 undef store i8* %tmp14, i8** %tmp, align 8 br label %bb1 diff --git a/test/Transforms/GVN/PRE/nonintegral.ll b/test/Transforms/GVN/PRE/nonintegral.ll new file mode 100644 index 000000000000..75a756e8af8c --- /dev/null +++ b/test/Transforms/GVN/PRE/nonintegral.ll @@ -0,0 +1,39 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -gvn -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:4" +target triple = "x86_64-unknown-linux-gnu" + +define void @nipre(double addrspace(4)** noalias %p, i64 addrspace(4)** noalias %p2, i8 %jmp) { + +; CHECK-LABEL: @nipre( +; CHECK: [[PCAST:%.*]] = bitcast double addrspace(4)** [[P:%.*]] to i64 addrspace(4)** +; CHECK: a: +; CHECK: [[L1:%.*]] = load i64 addrspace(4)*, i64 addrspace(4)** [[PCAST]] +; CHECK: [[TMP0:%.*]] = bitcast i64 addrspace(4)* [[L1]] to double addrspace(4)* +; CHECK: b: +; CHECK: [[L2:%.*]] = load i64 addrspace(4)*, i64 addrspace(4)** [[PCAST]] +; CHECK: [[TMP1:%.*]] = bitcast i64 addrspace(4)* [[L2]] to 
double addrspace(4)* +; CHECK: c: +; CHECK-NEXT: [[L3_PRE:%.*]] = load double addrspace(4)*, double addrspace(4)** %p + +entry: + %pcast = bitcast double addrspace(4)** %p to i64 addrspace(4)** + switch i8 %jmp, label %c [ i8 0, label %a + i8 1, label %b] +a: + %l1 = load i64 addrspace(4)*, i64 addrspace(4)** %pcast + store i64 addrspace(4)* %l1, i64 addrspace(4)** %p2 + br label %tail +b: + %l2 = load i64 addrspace(4)*, i64 addrspace(4)** %pcast + store i64 addrspace(4)* %l2, i64 addrspace(4)** %p2 + br label %tail +c: + br label %tail +tail: + %l3 = load double addrspace(4)*, double addrspace(4)** %p + %l3cast = bitcast double addrspace(4)* %l3 to i64 addrspace(4)* + store i64 addrspace(4)* %l3cast, i64 addrspace(4)** %p2 + ret void +} diff --git a/test/Transforms/IndVarSimplify/2011-10-27-lftrnull.ll b/test/Transforms/IndVarSimplify/2011-10-27-lftrnull.ll index 3d77a364f96f..49e5d24296c0 100644 --- a/test/Transforms/IndVarSimplify/2011-10-27-lftrnull.ll +++ b/test/Transforms/IndVarSimplify/2011-10-27-lftrnull.ll @@ -6,7 +6,7 @@ target triple = "thumbv7-apple-darwin" ; CHECK-LABEL: @test( ; CHECK: if.end.i126: -; CHECK: %exitcond = icmp ne i8* %incdec.ptr.i, getelementptr (i8, i8* null, i32 undef) +; CHECK: %exitcond = icmp ne i8* %incdec.ptr.i, null define void @test() nounwind { entry: br label %while.cond diff --git a/test/Transforms/InferFunctionAttrs/annotate.ll b/test/Transforms/InferFunctionAttrs/annotate.ll index 64676bf310bd..cb4b5cdd1e8c 100644 --- a/test/Transforms/InferFunctionAttrs/annotate.ll +++ b/test/Transforms/InferFunctionAttrs/annotate.ll @@ -22,12 +22,138 @@ declare i32 @__nvvm_reflect(i8*) ; Use an opaque pointer type for all the (possibly opaque) structs. %opaque = type opaque +; CHECK: declare double @__acos_finite(double) +declare double @__acos_finite(double) + +; CHECK: declare float @__acosf_finite(float) +declare float @__acosf_finite(float) + +; CHECK: declare double @__acosh_finite(double) +declare double @__acosh_finite(double) + +; CHECK: declare float @__acoshf_finite(float) +declare float @__acoshf_finite(float) + +; CHECK: declare x86_fp80 @__acoshl_finite(x86_fp80) +declare x86_fp80 @__acoshl_finite(x86_fp80) + +; CHECK: declare x86_fp80 @__acosl_finite(x86_fp80) +declare x86_fp80 @__acosl_finite(x86_fp80) + +; CHECK: declare double @__asin_finite(double) +declare double @__asin_finite(double) + +; CHECK: declare float @__asinf_finite(float) +declare float @__asinf_finite(float) + +; CHECK: declare x86_fp80 @__asinl_finite(x86_fp80) +declare x86_fp80 @__asinl_finite(x86_fp80) + +; CHECK: declare double @__atan2_finite(double, double) +declare double @__atan2_finite(double, double) + +; CHECK: declare float @__atan2f_finite(float, float) +declare float @__atan2f_finite(float, float) + +; CHECK: declare x86_fp80 @__atan2l_finite(x86_fp80, x86_fp80) +declare x86_fp80 @__atan2l_finite(x86_fp80, x86_fp80) + +; CHECK: declare double @__atanh_finite(double) +declare double @__atanh_finite(double) + +; CHECK: declare float @__atanhf_finite(float) +declare float @__atanhf_finite(float) + +; CHECK: declare x86_fp80 @__atanhl_finite(x86_fp80) +declare x86_fp80 @__atanhl_finite(x86_fp80) + +; CHECK: declare double @__cosh_finite(double) +declare double @__cosh_finite(double) + +; CHECK: declare float @__coshf_finite(float) +declare float @__coshf_finite(float) + +; CHECK: declare x86_fp80 @__coshl_finite(x86_fp80) +declare x86_fp80 @__coshl_finite(x86_fp80) + ; CHECK: declare double @__cospi(double) declare double @__cospi(double) ; CHECK: declare float 
@__cospif(float) declare float @__cospif(float) +; CHECK: declare double @__exp10_finite(double) +declare double @__exp10_finite(double) + +; CHECK: declare float @__exp10f_finite(float) +declare float @__exp10f_finite(float) + +; CHECK: declare x86_fp80 @__exp10l_finite(x86_fp80) +declare x86_fp80 @__exp10l_finite(x86_fp80) + +; CHECK: declare double @__exp2_finite(double) +declare double @__exp2_finite(double) + +; CHECK: declare float @__exp2f_finite(float) +declare float @__exp2f_finite(float) + +; CHECK: declare x86_fp80 @__exp2l_finite(x86_fp80) +declare x86_fp80 @__exp2l_finite(x86_fp80) + +; CHECK: declare double @__exp_finite(double) +declare double @__exp_finite(double) + +; CHECK: declare float @__expf_finite(float) +declare float @__expf_finite(float) + +; CHECK: declare x86_fp80 @__expl_finite(x86_fp80) +declare x86_fp80 @__expl_finite(x86_fp80) + +; CHECK: declare double @__log10_finite(double) +declare double @__log10_finite(double) + +; CHECK: declare float @__log10f_finite(float) +declare float @__log10f_finite(float) + +; CHECK: declare x86_fp80 @__log10l_finite(x86_fp80) +declare x86_fp80 @__log10l_finite(x86_fp80) + +; CHECK: declare double @__log2_finite(double) +declare double @__log2_finite(double) + +; CHECK: declare float @__log2f_finite(float) +declare float @__log2f_finite(float) + +; CHECK: declare x86_fp80 @__log2l_finite(x86_fp80) +declare x86_fp80 @__log2l_finite(x86_fp80) + +; CHECK: declare double @__log_finite(double) +declare double @__log_finite(double) + +; CHECK: declare float @__logf_finite(float) +declare float @__logf_finite(float) + +; CHECK: declare x86_fp80 @__logl_finite(x86_fp80) +declare x86_fp80 @__logl_finite(x86_fp80) + +; CHECK: declare double @__pow_finite(double, double) +declare double @__pow_finite(double, double) + +; CHECK: declare float @__powf_finite(float, float) +declare float @__powf_finite(float, float) + +; CHECK: declare x86_fp80 @__powl_finite(x86_fp80, x86_fp80) +declare x86_fp80 @__powl_finite(x86_fp80, x86_fp80) + +; CHECK: declare double @__sinh_finite(double) +declare double @__sinh_finite(double) + +; CHECK: declare float @__sinhf_finite(float) +declare float @__sinhf_finite(float) + +; CHECK: declare x86_fp80 @__sinhl_finite(x86_fp80) +declare x86_fp80 @__sinhl_finite(x86_fp80) + ; CHECK: declare double @__sinpi(double) declare double @__sinpi(double) diff --git a/test/Transforms/InferFunctionAttrs/no-proto.ll b/test/Transforms/InferFunctionAttrs/no-proto.ll index 25a4805c367f..3cab0ab4bf40 100644 --- a/test/Transforms/InferFunctionAttrs/no-proto.ll +++ b/test/Transforms/InferFunctionAttrs/no-proto.ll @@ -3,12 +3,138 @@ ; Check that we don't modify libc functions with invalid prototypes. +; CHECK: declare void @__acos_finite(...) +declare void @__acos_finite(...) + +; CHECK: declare void @__acosf_finite(...) +declare void @__acosf_finite(...) + +; CHECK: declare void @__acosh_finite(...) +declare void @__acosh_finite(...) + +; CHECK: declare void @__acoshf_finite(...) +declare void @__acoshf_finite(...) + +; CHECK: declare void @__acoshl_finite(...) +declare void @__acoshl_finite(...) + +; CHECK: declare void @__acosl_finite(...) +declare void @__acosl_finite(...) + +; CHECK: declare void @__asin_finite(...) +declare void @__asin_finite(...) + +; CHECK: declare void @__asinf_finite(...) +declare void @__asinf_finite(...) + +; CHECK: declare void @__asinl_finite(...) +declare void @__asinl_finite(...) + +; CHECK: declare void @__atan2_finite(...) +declare void @__atan2_finite(...) 
+ +; CHECK: declare void @__atan2f_finite(...) +declare void @__atan2f_finite(...) + +; CHECK: declare void @__atan2l_finite(...) +declare void @__atan2l_finite(...) + +; CHECK: declare void @__atanh_finite(...) +declare void @__atanh_finite(...) + +; CHECK: declare void @__atanhf_finite(...) +declare void @__atanhf_finite(...) + +; CHECK: declare void @__atanhl_finite(...) +declare void @__atanhl_finite(...) + +; CHECK: declare void @__cosh_finite(...) +declare void @__cosh_finite(...) + +; CHECK: declare void @__coshf_finite(...) +declare void @__coshf_finite(...) + +; CHECK: declare void @__coshl_finite(...) +declare void @__coshl_finite(...) + ; CHECK: declare void @__cospi(...) declare void @__cospi(...) ; CHECK: declare void @__cospif(...) declare void @__cospif(...) +; CHECK: declare void @__exp10_finite(...) +declare void @__exp10_finite(...) + +; CHECK: declare void @__exp10f_finite(...) +declare void @__exp10f_finite(...) + +; CHECK: declare void @__exp10l_finite(...) +declare void @__exp10l_finite(...) + +; CHECK: declare void @__exp2_finite(...) +declare void @__exp2_finite(...) + +; CHECK: declare void @__exp2f_finite(...) +declare void @__exp2f_finite(...) + +; CHECK: declare void @__exp2l_finite(...) +declare void @__exp2l_finite(...) + +; CHECK: declare void @__exp_finite(...) +declare void @__exp_finite(...) + +; CHECK: declare void @__expf_finite(...) +declare void @__expf_finite(...) + +; CHECK: declare void @__expl_finite(...) +declare void @__expl_finite(...) + +; CHECK: declare void @__log10_finite(...) +declare void @__log10_finite(...) + +; CHECK: declare void @__log10f_finite(...) +declare void @__log10f_finite(...) + +; CHECK: declare void @__log10l_finite(...) +declare void @__log10l_finite(...) + +; CHECK: declare void @__log2_finite(...) +declare void @__log2_finite(...) + +; CHECK: declare void @__log2f_finite(...) +declare void @__log2f_finite(...) + +; CHECK: declare void @__log2l_finite(...) +declare void @__log2l_finite(...) + +; CHECK: declare void @__log_finite(...) +declare void @__log_finite(...) + +; CHECK: declare void @__logf_finite(...) +declare void @__logf_finite(...) + +; CHECK: declare void @__logl_finite(...) +declare void @__logl_finite(...) + +; CHECK: declare void @__pow_finite(...) +declare void @__pow_finite(...) + +; CHECK: declare void @__powf_finite(...) +declare void @__powf_finite(...) + +; CHECK: declare void @__powl_finite(...) +declare void @__powl_finite(...) + +; CHECK: declare void @__sinh_finite(...) +declare void @__sinh_finite(...) + +; CHECK: declare void @__sinhf_finite(...) +declare void @__sinhf_finite(...) + +; CHECK: declare void @__sinhl_finite(...) +declare void @__sinhl_finite(...) + ; CHECK: declare void @__sinpi(...) declare void @__sinpi(...) diff --git a/test/Transforms/Inline/inline-cold.ll b/test/Transforms/Inline/inline-cold.ll index 93d2569d87ad..e0e679ad4036 100644 --- a/test/Transforms/Inline/inline-cold.ll +++ b/test/Transforms/Inline/inline-cold.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -inline -S -inlinecold-threshold=75 | FileCheck %s +; RUN: opt < %s -inline -S -inlinecold-threshold=25 | FileCheck %s ; Test that functions with attribute Cold are not inlined while the ; same function without attribute Cold will be inlined. 
@@ -64,23 +64,7 @@ entry: %x3 = add i32 %x2, %a3 %a4 = load volatile i32, i32* @a %x4 = add i32 %x3, %a4 - %a5 = load volatile i32, i32* @a - %x5 = add i32 %x4, %a5 - %a6 = load volatile i32, i32* @a - %x6 = add i32 %x5, %a6 - %a7 = load volatile i32, i32* @a - %x7 = add i32 %x6, %a6 - %a8 = load volatile i32, i32* @a - %x8 = add i32 %x7, %a8 - %a9 = load volatile i32, i32* @a - %x9 = add i32 %x8, %a9 - %a10 = load volatile i32, i32* @a - %x10 = add i32 %x9, %a10 - %a11 = load volatile i32, i32* @a - %x11 = add i32 %x10, %a11 - %a12 = load volatile i32, i32* @a - %x12 = add i32 %x11, %a12 - %add = add i32 %x12, %a + %add = add i32 %x4, %a ret i32 %add } diff --git a/test/Transforms/Inline/inline-constexpr-addrspacecast-argument.ll b/test/Transforms/Inline/inline-constexpr-addrspacecast-argument.ll index 1f2b143c97ee..b8d41abe1c35 100644 --- a/test/Transforms/Inline/inline-constexpr-addrspacecast-argument.ll +++ b/test/Transforms/Inline/inline-constexpr-addrspacecast-argument.ll @@ -6,7 +6,7 @@ target datalayout = "e-p3:32:32-p4:64:64-n32" @lds = internal addrspace(3) global [64 x i64] zeroinitializer ; CHECK-LABEL: @constexpr_addrspacecast_ptr_size_change( -; CHECK: load i64, i64 addrspace(4)* getelementptr (i64, i64 addrspace(4)* addrspacecast (i64 addrspace(3)* getelementptr inbounds ([64 x i64], [64 x i64] addrspace(3)* @lds, i32 0, i32 0) to i64 addrspace(4)*), i64 undef) +; CHECK: load i64, i64 addrspace(4)* addrspacecast (i64 addrspace(3)* getelementptr inbounds ([64 x i64], [64 x i64] addrspace(3)* @lds, i32 0, i32 0) to i64 addrspace(4)*) ; CHECK-NEXT: br define void @constexpr_addrspacecast_ptr_size_change() #0 { %tmp0 = call i32 @foo(i64 addrspace(4)* addrspacecast (i64 addrspace(3)* getelementptr inbounds ([64 x i64], [64 x i64] addrspace(3)* @lds, i32 0, i32 0) to i64 addrspace(4)*)) #1 diff --git a/test/Transforms/Inline/partial-inline-act.ll b/test/Transforms/Inline/partial-inline-act.ll index 916436260bd6..27e719153875 100644 --- a/test/Transforms/Inline/partial-inline-act.ll +++ b/test/Transforms/Inline/partial-inline-act.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -partial-inliner -disable-output +; RUN: opt < %s -partial-inliner -skip-partial-inlining-cost-analysis -disable-output ; This testcase tests the assumption cache define internal i32 @inlinedFunc(i1 %cond, i32* align 4 %align.val) { diff --git a/test/Transforms/Inline/prof-update.ll b/test/Transforms/Inline/prof-update.ll index 3fefa1c56cea..4a4471e8e17a 100644 --- a/test/Transforms/Inline/prof-update.ll +++ b/test/Transforms/Inline/prof-update.ll @@ -6,21 +6,21 @@ declare void @ext1(); @func = global void ()* null ; CHECK: define void @callee(i32 %n) !prof ![[ENTRY_COUNT:[0-9]*]] -define void @callee(i32 %n) !prof !1 { +define void @callee(i32 %n) !prof !15 { %cond = icmp sle i32 %n, 10 br i1 %cond, label %cond_true, label %cond_false cond_true: ; ext1 is optimized away, thus not updated. ; CHECK: call void @ext1(), !prof ![[COUNT_CALLEE1:[0-9]*]] - call void @ext1(), !prof !2 + call void @ext1(), !prof !16 ret void cond_false: ; ext is cloned and updated. 
; CHECK: call void @ext(), !prof ![[COUNT_CALLEE:[0-9]*]] - call void @ext(), !prof !2 + call void @ext(), !prof !16 %f = load void ()*, void ()** @func ; CHECK: call void %f(), !prof ![[COUNT_IND_CALLEE:[0-9]*]] - call void %f(), !prof !4 + call void %f(), !prof !18 ret void } @@ -28,16 +28,29 @@ cond_false: define void @caller() { ; CHECK: call void @ext(), !prof ![[COUNT_CALLER:[0-9]*]] ; CHECK: call void %f.i(), !prof ![[COUNT_IND_CALLER:[0-9]*]] - call void @callee(i32 15), !prof !3 + call void @callee(i32 15), !prof !17 ret void } -!llvm.module.flags = !{!0} -!0 = !{i32 1, !"MaxFunctionCount", i32 2000} -!1 = !{!"function_entry_count", i64 1000} -!2 = !{!"branch_weights", i64 2000} -!3 = !{!"branch_weights", i64 400} -!4 = !{!"VP", i32 0, i64 140, i64 111, i64 80, i64 222, i64 40, i64 333, i64 20} +!llvm.module.flags = !{!1} +!1 = !{i32 1, !"ProfileSummary", !2} +!2 = !{!3, !4, !5, !6, !7, !8, !9, !10} +!3 = !{!"ProfileFormat", !"SampleProfile"} +!4 = !{!"TotalCount", i64 10000} +!5 = !{!"MaxCount", i64 10} +!6 = !{!"MaxInternalCount", i64 1} +!7 = !{!"MaxFunctionCount", i64 2000} +!8 = !{!"NumCounts", i64 2} +!9 = !{!"NumFunctions", i64 2} +!10 = !{!"DetailedSummary", !11} +!11 = !{!12, !13, !14} +!12 = !{i32 10000, i64 100, i32 1} +!13 = !{i32 999000, i64 100, i32 1} +!14 = !{i32 999999, i64 1, i32 2} +!15 = !{!"function_entry_count", i64 1000} +!16 = !{!"branch_weights", i64 2000} +!17 = !{!"branch_weights", i64 400} +!18 = !{!"VP", i32 0, i64 140, i64 111, i64 80, i64 222, i64 40, i64 333, i64 20} attributes #0 = { alwaysinline } ; CHECK: ![[ENTRY_COUNT]] = !{!"function_entry_count", i64 600} ; CHECK: ![[COUNT_CALLEE1]] = !{!"branch_weights", i64 2000} diff --git a/test/Transforms/InstCombine/2012-04-23-Neon-Intrinsics.ll b/test/Transforms/InstCombine/AArch64/2012-04-23-Neon-Intrinsics.ll index 39408a2d394c..04fb7d91193a 100644 --- a/test/Transforms/InstCombine/2012-04-23-Neon-Intrinsics.ll +++ b/test/Transforms/InstCombine/AArch64/2012-04-23-Neon-Intrinsics.ll @@ -1,70 +1,6 @@ ; RUN: opt -S -instcombine < %s | FileCheck %s - -define <4 x i32> @mulByZero(<4 x i16> %x) nounwind readnone ssp { -entry: - %a = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %x, <4 x i16> zeroinitializer) nounwind - ret <4 x i32> %a -; CHECK: entry: -; CHECK-NEXT: ret <4 x i32> zeroinitializer -} - -define <4 x i32> @mulByOne(<4 x i16> %x) nounwind readnone ssp { -entry: - %a = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %x, <4 x i16> <i16 1, i16 1, i16 1, i16 1>) nounwind - ret <4 x i32> %a -; CHECK: entry: -; CHECK-NEXT: %a = sext <4 x i16> %x to <4 x i32> -; CHECK-NEXT: ret <4 x i32> %a -} - -define <4 x i32> @constantMul() nounwind readnone ssp { -entry: - %a = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> <i16 3, i16 3, i16 3, i16 3>, <4 x i16> <i16 2, i16 2, i16 2, i16 2>) nounwind - ret <4 x i32> %a -; CHECK: entry: -; CHECK-NEXT: ret <4 x i32> <i32 6, i32 6, i32 6, i32 6> -} - -define <4 x i32> @constantMulS() nounwind readnone ssp { -entry: - %b = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>, <4 x i16> <i16 1, i16 1, i16 1, i16 1>) nounwind - ret <4 x i32> %b -; CHECK: entry: -; CHECK-NEXT: ret <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1> -} - -define <4 x i32> @constantMulU() nounwind readnone ssp { -entry: - %b = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>, <4 x i16> <i16 1, i16 1, i16 1, i16 1>) nounwind - ret <4 x i32> %b -; CHECK: entry: -; CHECK-NEXT: ret <4 x 
i32> <i32 65535, i32 65535, i32 65535, i32 65535> -} - -define <4 x i32> @complex1(<4 x i16> %x) nounwind readnone ssp { -entry: - %a = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> <i16 2, i16 2, i16 2, i16 2>, <4 x i16> %x) nounwind - %b = add <4 x i32> zeroinitializer, %a - ret <4 x i32> %b -; CHECK: entry: -; CHECK-NEXT: %a = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> <i16 2, i16 2, i16 2, i16 2>, <4 x i16> %x) [[NUW:#[0-9]+]] -; CHECK-NEXT: ret <4 x i32> %a -} - -define <4 x i32> @complex2(<4 x i32> %x) nounwind readnone ssp { -entry: - %a = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> <i16 3, i16 3, i16 3, i16 3>, <4 x i16> <i16 2, i16 2, i16 2, i16 2>) nounwind - %b = add <4 x i32> %x, %a - ret <4 x i32> %b -; CHECK: entry: -; CHECK-NEXT: %b = add <4 x i32> %x, <i32 6, i32 6, i32 6, i32 6> -; CHECK-NEXT: ret <4 x i32> %b -} - -declare <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16>, <4 x i16>) nounwind readnone -declare <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16>, <4 x i16>) nounwind readnone - -; ARM64 variants - <rdar://problem/12349617> +; ARM64 neon intrinsic variants - <rdar://problem/12349617> +; REQUIRES: aarch64 define <4 x i32> @mulByZeroARM64(<4 x i16> %x) nounwind readnone ssp { entry: diff --git a/test/Transforms/InstCombine/AArch64/lit.local.cfg b/test/Transforms/InstCombine/AArch64/lit.local.cfg new file mode 100644 index 000000000000..7184443994b6 --- /dev/null +++ b/test/Transforms/InstCombine/AArch64/lit.local.cfg @@ -0,0 +1,2 @@ +if not 'AArch64' in config.root.targets: + config.unsupported = True diff --git a/test/Transforms/InstCombine/amdgcn-intrinsics.ll b/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll index 1901997c5521..1901997c5521 100644 --- a/test/Transforms/InstCombine/amdgcn-intrinsics.ll +++ b/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll diff --git a/test/Transforms/InstCombine/AMDGPU/lit.local.cfg b/test/Transforms/InstCombine/AMDGPU/lit.local.cfg new file mode 100644 index 000000000000..2a665f06be72 --- /dev/null +++ b/test/Transforms/InstCombine/AMDGPU/lit.local.cfg @@ -0,0 +1,2 @@ +if not 'AMDGPU' in config.root.targets: + config.unsupported = True diff --git a/test/Transforms/InstCombine/ARM/2012-04-23-Neon-Intrinsics.ll b/test/Transforms/InstCombine/ARM/2012-04-23-Neon-Intrinsics.ll new file mode 100644 index 000000000000..9efed367d19f --- /dev/null +++ b/test/Transforms/InstCombine/ARM/2012-04-23-Neon-Intrinsics.ll @@ -0,0 +1,65 @@ +; RUN: opt -S -instcombine < %s | FileCheck %s + +define <4 x i32> @mulByZero(<4 x i16> %x) nounwind readnone ssp { +entry: + %a = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %x, <4 x i16> zeroinitializer) nounwind + ret <4 x i32> %a +; CHECK: entry: +; CHECK-NEXT: ret <4 x i32> zeroinitializer +} + +define <4 x i32> @mulByOne(<4 x i16> %x) nounwind readnone ssp { +entry: + %a = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %x, <4 x i16> <i16 1, i16 1, i16 1, i16 1>) nounwind + ret <4 x i32> %a +; CHECK: entry: +; CHECK-NEXT: %a = sext <4 x i16> %x to <4 x i32> +; CHECK-NEXT: ret <4 x i32> %a +} + +define <4 x i32> @constantMul() nounwind readnone ssp { +entry: + %a = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> <i16 3, i16 3, i16 3, i16 3>, <4 x i16> <i16 2, i16 2, i16 2, i16 2>) nounwind + ret <4 x i32> %a +; CHECK: entry: +; CHECK-NEXT: ret <4 x i32> <i32 6, i32 6, i32 6, i32 6> +} + +define <4 x i32> @constantMulS() nounwind readnone ssp { +entry: + %b = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> 
<i16 -1, i16 -1, i16 -1, i16 -1>, <4 x i16> <i16 1, i16 1, i16 1, i16 1>) nounwind + ret <4 x i32> %b +; CHECK: entry: +; CHECK-NEXT: ret <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1> +} + +define <4 x i32> @constantMulU() nounwind readnone ssp { +entry: + %b = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>, <4 x i16> <i16 1, i16 1, i16 1, i16 1>) nounwind + ret <4 x i32> %b +; CHECK: entry: +; CHECK-NEXT: ret <4 x i32> <i32 65535, i32 65535, i32 65535, i32 65535> +} + +define <4 x i32> @complex1(<4 x i16> %x) nounwind readnone ssp { +entry: + %a = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> <i16 2, i16 2, i16 2, i16 2>, <4 x i16> %x) nounwind + %b = add <4 x i32> zeroinitializer, %a + ret <4 x i32> %b +; CHECK: entry: +; CHECK-NEXT: %a = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> <i16 2, i16 2, i16 2, i16 2>, <4 x i16> %x) [[NUW:#[0-9]+]] +; CHECK-NEXT: ret <4 x i32> %a +} + +define <4 x i32> @complex2(<4 x i32> %x) nounwind readnone ssp { +entry: + %a = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> <i16 3, i16 3, i16 3, i16 3>, <4 x i16> <i16 2, i16 2, i16 2, i16 2>) nounwind + %b = add <4 x i32> %x, %a + ret <4 x i32> %b +; CHECK: entry: +; CHECK-NEXT: %b = add <4 x i32> %x, <i32 6, i32 6, i32 6, i32 6> +; CHECK-NEXT: ret <4 x i32> %b +} + +declare <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16>, <4 x i16>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16>, <4 x i16>) nounwind readnone diff --git a/test/Transforms/InstCombine/constant-fold-hang.ll b/test/Transforms/InstCombine/ARM/constant-fold-hang.ll index 2ca6b86ccc2f..2ca6b86ccc2f 100644 --- a/test/Transforms/InstCombine/constant-fold-hang.ll +++ b/test/Transforms/InstCombine/ARM/constant-fold-hang.ll diff --git a/test/Transforms/InstCombine/ARM/lit.local.cfg b/test/Transforms/InstCombine/ARM/lit.local.cfg new file mode 100644 index 000000000000..236e1d344166 --- /dev/null +++ b/test/Transforms/InstCombine/ARM/lit.local.cfg @@ -0,0 +1,2 @@ +if not 'ARM' in config.root.targets: + config.unsupported = True diff --git a/test/Transforms/InstCombine/neon-intrinsics.ll b/test/Transforms/InstCombine/ARM/neon-intrinsics.ll index d22fa9c811dc..d22fa9c811dc 100644 --- a/test/Transforms/InstCombine/neon-intrinsics.ll +++ b/test/Transforms/InstCombine/ARM/neon-intrinsics.ll diff --git a/test/Transforms/InstCombine/aligned-altivec.ll b/test/Transforms/InstCombine/PowerPC/aligned-altivec.ll index 10b4e4d62631..10b4e4d62631 100644 --- a/test/Transforms/InstCombine/aligned-altivec.ll +++ b/test/Transforms/InstCombine/PowerPC/aligned-altivec.ll diff --git a/test/Transforms/InstCombine/aligned-qpx.ll b/test/Transforms/InstCombine/PowerPC/aligned-qpx.ll index e9710df5670c..e9710df5670c 100644 --- a/test/Transforms/InstCombine/aligned-qpx.ll +++ b/test/Transforms/InstCombine/PowerPC/aligned-qpx.ll diff --git a/test/Transforms/InstCombine/PowerPC/lit.local.cfg b/test/Transforms/InstCombine/PowerPC/lit.local.cfg new file mode 100644 index 000000000000..5d33887ff0a4 --- /dev/null +++ b/test/Transforms/InstCombine/PowerPC/lit.local.cfg @@ -0,0 +1,3 @@ +if not 'PowerPC' in config.root.targets: + config.unsupported = True + diff --git a/test/Transforms/InstCombine/vsx-unaligned.ll b/test/Transforms/InstCombine/PowerPC/vsx-unaligned.ll index ad264fb15b31..ad264fb15b31 100644 --- a/test/Transforms/InstCombine/vsx-unaligned.ll +++ b/test/Transforms/InstCombine/PowerPC/vsx-unaligned.ll diff --git a/test/Transforms/InstCombine/X86FsubCmpCombine.ll 
b/test/Transforms/InstCombine/X86/X86FsubCmpCombine.ll index fde0692d00a2..fde0692d00a2 100644 --- a/test/Transforms/InstCombine/X86FsubCmpCombine.ll +++ b/test/Transforms/InstCombine/X86/X86FsubCmpCombine.ll diff --git a/test/Transforms/InstCombine/blend_x86.ll b/test/Transforms/InstCombine/X86/blend_x86.ll index 39ceb0186efe..39ceb0186efe 100644 --- a/test/Transforms/InstCombine/blend_x86.ll +++ b/test/Transforms/InstCombine/X86/blend_x86.ll diff --git a/test/Transforms/InstCombine/X86/lit.local.cfg b/test/Transforms/InstCombine/X86/lit.local.cfg new file mode 100644 index 000000000000..c8625f4d9d24 --- /dev/null +++ b/test/Transforms/InstCombine/X86/lit.local.cfg @@ -0,0 +1,2 @@ +if not 'X86' in config.root.targets: + config.unsupported = True diff --git a/test/Transforms/InstCombine/pr2645-1.ll b/test/Transforms/InstCombine/X86/pr2645-1.ll index 2986d21866bf..2986d21866bf 100644 --- a/test/Transforms/InstCombine/pr2645-1.ll +++ b/test/Transforms/InstCombine/X86/pr2645-1.ll diff --git a/test/Transforms/InstCombine/shufflemask-undef.ll b/test/Transforms/InstCombine/X86/shufflemask-undef.ll index 10509a92941b..d95c42da5f7e 100644 --- a/test/Transforms/InstCombine/shufflemask-undef.ll +++ b/test/Transforms/InstCombine/X86/shufflemask-undef.ll @@ -1,4 +1,5 @@ -; RUN: opt < %s -instcombine -S | not grep "shufflevector.*i32 8" +; RUN: opt < %s -instcombine -S | FileCheck %s +; CHECK-NOT: shufflevector{{.*}}i32 8" target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" target triple = "i386-apple-darwin9" diff --git a/test/Transforms/InstCombine/x86-avx2.ll b/test/Transforms/InstCombine/X86/x86-avx2.ll index f4045f788e2d..f4045f788e2d 100644 --- a/test/Transforms/InstCombine/x86-avx2.ll +++ b/test/Transforms/InstCombine/X86/x86-avx2.ll diff --git a/test/Transforms/InstCombine/x86-avx512.ll b/test/Transforms/InstCombine/X86/x86-avx512.ll index 2a24d93ce76a..2a24d93ce76a 100644 --- a/test/Transforms/InstCombine/x86-avx512.ll +++ b/test/Transforms/InstCombine/X86/x86-avx512.ll diff --git a/test/Transforms/InstCombine/x86-crc32-demanded.ll b/test/Transforms/InstCombine/X86/x86-crc32-demanded.ll index 878b97d1bb22..878b97d1bb22 100644 --- a/test/Transforms/InstCombine/x86-crc32-demanded.ll +++ b/test/Transforms/InstCombine/X86/x86-crc32-demanded.ll diff --git a/test/Transforms/InstCombine/x86-f16c.ll b/test/Transforms/InstCombine/X86/x86-f16c.ll index 6b5b6cb26eda..6b5b6cb26eda 100644 --- a/test/Transforms/InstCombine/x86-f16c.ll +++ b/test/Transforms/InstCombine/X86/x86-f16c.ll diff --git a/test/Transforms/InstCombine/x86-fma.ll b/test/Transforms/InstCombine/X86/x86-fma.ll index 0d27d3276163..0d27d3276163 100644 --- a/test/Transforms/InstCombine/x86-fma.ll +++ b/test/Transforms/InstCombine/X86/x86-fma.ll diff --git a/test/Transforms/InstCombine/x86-insertps.ll b/test/Transforms/InstCombine/X86/x86-insertps.ll index f55ea6f22d2e..f55ea6f22d2e 100644 --- a/test/Transforms/InstCombine/x86-insertps.ll +++ b/test/Transforms/InstCombine/X86/x86-insertps.ll diff --git a/test/Transforms/InstCombine/x86-masked-memops.ll b/test/Transforms/InstCombine/X86/x86-masked-memops.ll index 8502b1899ecb..8502b1899ecb 100644 --- a/test/Transforms/InstCombine/x86-masked-memops.ll +++ b/test/Transforms/InstCombine/X86/x86-masked-memops.ll diff --git a/test/Transforms/InstCombine/x86-movmsk.ll b/test/Transforms/InstCombine/X86/x86-movmsk.ll index 11acc1dbca84..11acc1dbca84 100644 --- a/test/Transforms/InstCombine/x86-movmsk.ll +++ 
b/test/Transforms/InstCombine/X86/x86-movmsk.ll diff --git a/test/Transforms/InstCombine/x86-muldq.ll b/test/Transforms/InstCombine/X86/x86-muldq.ll index bcbb8919c403..bcbb8919c403 100644 --- a/test/Transforms/InstCombine/x86-muldq.ll +++ b/test/Transforms/InstCombine/X86/x86-muldq.ll diff --git a/test/Transforms/InstCombine/x86-pack.ll b/test/Transforms/InstCombine/X86/x86-pack.ll index f3c41a8aa476..f3c41a8aa476 100644 --- a/test/Transforms/InstCombine/x86-pack.ll +++ b/test/Transforms/InstCombine/X86/x86-pack.ll diff --git a/test/Transforms/InstCombine/x86-pshufb.ll b/test/Transforms/InstCombine/X86/x86-pshufb.ll index f181ef57fe20..f181ef57fe20 100644 --- a/test/Transforms/InstCombine/x86-pshufb.ll +++ b/test/Transforms/InstCombine/X86/x86-pshufb.ll diff --git a/test/Transforms/InstCombine/x86-sse.ll b/test/Transforms/InstCombine/X86/x86-sse.ll index 6ed62a4e0224..6ed62a4e0224 100644 --- a/test/Transforms/InstCombine/x86-sse.ll +++ b/test/Transforms/InstCombine/X86/x86-sse.ll diff --git a/test/Transforms/InstCombine/x86-sse2.ll b/test/Transforms/InstCombine/X86/x86-sse2.ll index fe8828bfb5b2..fe8828bfb5b2 100644 --- a/test/Transforms/InstCombine/x86-sse2.ll +++ b/test/Transforms/InstCombine/X86/x86-sse2.ll diff --git a/test/Transforms/InstCombine/x86-sse41.ll b/test/Transforms/InstCombine/X86/x86-sse41.ll index 16975471b9e1..16975471b9e1 100644 --- a/test/Transforms/InstCombine/x86-sse41.ll +++ b/test/Transforms/InstCombine/X86/x86-sse41.ll diff --git a/test/Transforms/InstCombine/x86-sse4a.ll b/test/Transforms/InstCombine/X86/x86-sse4a.ll index e36a73532259..e36a73532259 100644 --- a/test/Transforms/InstCombine/x86-sse4a.ll +++ b/test/Transforms/InstCombine/X86/x86-sse4a.ll diff --git a/test/Transforms/InstCombine/X86/x86-vec_demanded_elts.ll b/test/Transforms/InstCombine/X86/x86-vec_demanded_elts.ll new file mode 100644 index 000000000000..5ad8e767d767 --- /dev/null +++ b/test/Transforms/InstCombine/X86/x86-vec_demanded_elts.ll @@ -0,0 +1,110 @@ +; RUN: opt < %s -instcombine -S | FileCheck %s +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define i16 @test1(float %f) { +; CHECK-LABEL: @test1( +; CHECK-NEXT: [[TMP281:%.*]] = fadd float %f, -1.000000e+00 +; CHECK-NEXT: [[TMP373:%.*]] = fmul float [[TMP281]], 5.000000e-01 +; CHECK-NEXT: [[TMP374:%.*]] = insertelement <4 x float> undef, float [[TMP373]], i32 0 +; CHECK-NEXT: [[TMP48:%.*]] = tail call <4 x float> @llvm.x86.sse.min.ss(<4 x float> [[TMP374]], <4 x float> <float 6.553500e+04, float undef, float undef, float undef>) +; CHECK-NEXT: [[TMP59:%.*]] = tail call <4 x float> @llvm.x86.sse.max.ss(<4 x float> [[TMP48]], <4 x float> <float 0.000000e+00, float undef, float undef, float undef>) +; CHECK-NEXT: [[TMP_UPGRD_1:%.*]] = tail call i32 @llvm.x86.sse.cvttss2si(<4 x float> [[TMP59]]) +; CHECK-NEXT: [[TMP69:%.*]] = trunc i32 [[TMP_UPGRD_1]] to i16 +; CHECK-NEXT: ret i16 [[TMP69]] +; + %tmp = insertelement <4 x float> undef, float %f, i32 0 + %tmp10 = insertelement <4 x float> %tmp, float 0.000000e+00, i32 1 + %tmp11 = insertelement <4 x float> %tmp10, float 0.000000e+00, i32 2 + %tmp12 = insertelement <4 x float> %tmp11, float 0.000000e+00, i32 3 + %tmp28 = tail call <4 x float> @llvm.x86.sse.sub.ss( <4 x float> %tmp12, <4 x float> < float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 > ) + %tmp37 = tail call <4 x float> @llvm.x86.sse.mul.ss( <4 x float> %tmp28, <4 x float> < float 5.000000e-01, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 > ) + %tmp48 = tail call <4 x 
float> @llvm.x86.sse.min.ss( <4 x float> %tmp37, <4 x float> < float 6.553500e+04, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 > ) + %tmp59 = tail call <4 x float> @llvm.x86.sse.max.ss( <4 x float> %tmp48, <4 x float> zeroinitializer ) + %tmp.upgrd.1 = tail call i32 @llvm.x86.sse.cvttss2si( <4 x float> %tmp59 ) + %tmp69 = trunc i32 %tmp.upgrd.1 to i16 + ret i16 %tmp69 +} + +define i64 @test3(float %f, double %d) { +; CHECK-LABEL: @test3( +; CHECK-NEXT: [[V00:%.*]] = insertelement <4 x float> undef, float %f, i32 0 +; CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.x86.sse.cvtss2si(<4 x float> [[V00]]) +; CHECK-NEXT: [[V10:%.*]] = insertelement <4 x float> undef, float %f, i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.x86.sse.cvtss2si64(<4 x float> [[V10]]) +; CHECK-NEXT: [[V20:%.*]] = insertelement <4 x float> undef, float %f, i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = tail call i32 @llvm.x86.sse.cvttss2si(<4 x float> [[V20]]) +; CHECK-NEXT: [[V30:%.*]] = insertelement <4 x float> undef, float %f, i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = tail call i64 @llvm.x86.sse.cvttss2si64(<4 x float> [[V30]]) +; CHECK-NEXT: [[V40:%.*]] = insertelement <2 x double> undef, double %d, i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = tail call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> [[V40]]) +; CHECK-NEXT: [[V50:%.*]] = insertelement <2 x double> undef, double %d, i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = tail call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> [[V50]]) +; CHECK-NEXT: [[V60:%.*]] = insertelement <2 x double> undef, double %d, i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = tail call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> [[V60]]) +; CHECK-NEXT: [[V70:%.*]] = insertelement <2 x double> undef, double %d, i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = tail call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> [[V70]]) +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP0]], [[TMP2]] +; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP4]], [[TMP6]] +; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = sext i32 [[TMP10]] to i64 +; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[TMP5]], [[TMP7]] +; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[TMP12]], [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[TMP14]], [[TMP11]] +; CHECK-NEXT: ret i64 [[TMP15]] +; + %v00 = insertelement <4 x float> undef, float %f, i32 0 + %v01 = insertelement <4 x float> %v00, float 0.000000e+00, i32 1 + %v02 = insertelement <4 x float> %v01, float 0.000000e+00, i32 2 + %v03 = insertelement <4 x float> %v02, float 0.000000e+00, i32 3 + %tmp0 = tail call i32 @llvm.x86.sse.cvtss2si(<4 x float> %v03) + %v10 = insertelement <4 x float> undef, float %f, i32 0 + %v11 = insertelement <4 x float> %v10, float 0.000000e+00, i32 1 + %v12 = insertelement <4 x float> %v11, float 0.000000e+00, i32 2 + %v13 = insertelement <4 x float> %v12, float 0.000000e+00, i32 3 + %tmp1 = tail call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %v13) + %v20 = insertelement <4 x float> undef, float %f, i32 0 + %v21 = insertelement <4 x float> %v20, float 0.000000e+00, i32 1 + %v22 = insertelement <4 x float> %v21, float 0.000000e+00, i32 2 + %v23 = insertelement <4 x float> %v22, float 0.000000e+00, i32 3 + %tmp2 = tail call i32 @llvm.x86.sse.cvttss2si(<4 x float> %v23) + %v30 = insertelement <4 x float> undef, float %f, i32 0 + %v31 = insertelement <4 x float> %v30, float 0.000000e+00, i32 1 + %v32 = insertelement <4 x float> %v31, float 0.000000e+00, i32 2 + %v33 = insertelement <4 x float> %v32, float 0.000000e+00, i32 3 + 
%tmp3 = tail call i64 @llvm.x86.sse.cvttss2si64(<4 x float> %v33) + %v40 = insertelement <2 x double> undef, double %d, i32 0 + %v41 = insertelement <2 x double> %v40, double 0.000000e+00, i32 1 + %tmp4 = tail call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %v41) + %v50 = insertelement <2 x double> undef, double %d, i32 0 + %v51 = insertelement <2 x double> %v50, double 0.000000e+00, i32 1 + %tmp5 = tail call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> %v51) + %v60 = insertelement <2 x double> undef, double %d, i32 0 + %v61 = insertelement <2 x double> %v60, double 0.000000e+00, i32 1 + %tmp6 = tail call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> %v61) + %v70 = insertelement <2 x double> undef, double %d, i32 0 + %v71 = insertelement <2 x double> %v70, double 0.000000e+00, i32 1 + %tmp7 = tail call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> %v71) + %tmp8 = add i32 %tmp0, %tmp2 + %tmp9 = add i32 %tmp4, %tmp6 + %tmp10 = add i32 %tmp8, %tmp9 + %tmp11 = sext i32 %tmp10 to i64 + %tmp12 = add i64 %tmp1, %tmp3 + %tmp13 = add i64 %tmp5, %tmp7 + %tmp14 = add i64 %tmp12, %tmp13 + %tmp15 = add i64 %tmp11, %tmp14 + ret i64 %tmp15 +} + +declare <4 x float> @llvm.x86.sse.sub.ss(<4 x float>, <4 x float>) +declare <4 x float> @llvm.x86.sse.mul.ss(<4 x float>, <4 x float>) +declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) +declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) +declare i32 @llvm.x86.sse.cvtss2si(<4 x float>) +declare i64 @llvm.x86.sse.cvtss2si64(<4 x float>) +declare i32 @llvm.x86.sse.cvttss2si(<4 x float>) +declare i64 @llvm.x86.sse.cvttss2si64(<4 x float>) +declare i32 @llvm.x86.sse2.cvtsd2si(<2 x double>) +declare i64 @llvm.x86.sse2.cvtsd2si64(<2 x double>) +declare i32 @llvm.x86.sse2.cvttsd2si(<2 x double>) +declare i64 @llvm.x86.sse2.cvttsd2si64(<2 x double>) diff --git a/test/Transforms/InstCombine/x86-vector-shifts.ll b/test/Transforms/InstCombine/X86/x86-vector-shifts.ll index 07934fbdfe72..07934fbdfe72 100644 --- a/test/Transforms/InstCombine/x86-vector-shifts.ll +++ b/test/Transforms/InstCombine/X86/x86-vector-shifts.ll diff --git a/test/Transforms/InstCombine/x86-vperm2.ll b/test/Transforms/InstCombine/X86/x86-vperm2.ll index 84f69aa25d24..84f69aa25d24 100644 --- a/test/Transforms/InstCombine/x86-vperm2.ll +++ b/test/Transforms/InstCombine/X86/x86-vperm2.ll diff --git a/test/Transforms/InstCombine/x86-vpermil.ll b/test/Transforms/InstCombine/X86/x86-vpermil.ll index f68eb36c4b58..f68eb36c4b58 100644 --- a/test/Transforms/InstCombine/x86-vpermil.ll +++ b/test/Transforms/InstCombine/X86/x86-vpermil.ll diff --git a/test/Transforms/InstCombine/x86-xop.ll b/test/Transforms/InstCombine/X86/x86-xop.ll index 03a3f921abb2..03a3f921abb2 100644 --- a/test/Transforms/InstCombine/x86-xop.ll +++ b/test/Transforms/InstCombine/X86/x86-xop.ll diff --git a/test/Transforms/InstCombine/add.ll b/test/Transforms/InstCombine/add.ll index 648305d134cd..5f7101e8feca 100644 --- a/test/Transforms/InstCombine/add.ll +++ b/test/Transforms/InstCombine/add.ll @@ -27,6 +27,32 @@ define <2 x i32> @select_0_or_1_from_bool_vec(<2 x i1> %x) { ret <2 x i32> %add } +; This is an 'andn' of the low bit. 
+ +define i32 @flip_and_mask(i32 %x) { +; CHECK-LABEL: @flip_and_mask( +; CHECK-NEXT: [[TMP1:%.*]] = and i32 %x, 1 +; CHECK-NEXT: [[INC:%.*]] = xor i32 [[TMP1]], 1 +; CHECK-NEXT: ret i32 [[INC]] +; + %shl = shl i32 %x, 31 + %shr = ashr i32 %shl, 31 + %inc = add i32 %shr, 1 + ret i32 %inc +} + +define <2 x i8> @flip_and_mask_splat(<2 x i8> %x) { +; CHECK-LABEL: @flip_and_mask_splat( +; CHECK-NEXT: [[TMP1:%.*]] = xor <2 x i8> %x, <i8 1, i8 1> +; CHECK-NEXT: [[INC:%.*]] = and <2 x i8> [[TMP1]], <i8 1, i8 1> +; CHECK-NEXT: ret <2 x i8> [[INC]] +; + %shl = shl <2 x i8> %x, <i8 7, i8 7> + %shr = ashr <2 x i8> %shl, <i8 7, i8 7> + %inc = add <2 x i8> %shr, <i8 1, i8 1> + ret <2 x i8> %inc +} + define i32 @test1(i32 %A) { ; CHECK-LABEL: @test1( ; CHECK-NEXT: ret i32 %A diff --git a/test/Transforms/InstCombine/and.ll b/test/Transforms/InstCombine/and.ll index 8ef7870891f0..7bb9b95b3179 100644 --- a/test/Transforms/InstCombine/and.ll +++ b/test/Transforms/InstCombine/and.ll @@ -310,7 +310,7 @@ define i8 @test27(i8 %A) { ret i8 %E } -;; This is juse a zero extending shr. +;; This is just a zero-extending shr. define i32 @test28(i32 %X) { ; CHECK-LABEL: @test28( ; CHECK-NEXT: [[Y1:%.*]] = lshr i32 %X, 24 diff --git a/test/Transforms/InstCombine/bit-tracking.ll b/test/Transforms/InstCombine/bit-tracking.ll deleted file mode 100644 index 51bbc0888836..000000000000 --- a/test/Transforms/InstCombine/bit-tracking.ll +++ /dev/null @@ -1,26 +0,0 @@ -; This file contains various testcases that require tracking whether bits are -; set or cleared by various instructions. -; RUN: opt < %s -instcombine -instcombine -S |\ -; RUN: not grep %ELIM - -; Reduce down to a single XOR -define i32 @test3(i32 %B) { - %ELIMinc = and i32 %B, 1 ; <i32> [#uses=1] - %tmp.5 = xor i32 %ELIMinc, 1 ; <i32> [#uses=1] - %ELIM7 = and i32 %B, -2 ; <i32> [#uses=1] - %tmp.8 = or i32 %tmp.5, %ELIM7 ; <i32> [#uses=1] - ret i32 %tmp.8 -} - -; Finally, a bigger case where we chain things together. This corresponds to -; incrementing a single-bit bitfield, which should become just an xor. -define i32 @test4(i32 %B) { - %ELIM3 = shl i32 %B, 31 ; <i32> [#uses=1] - %ELIM4 = ashr i32 %ELIM3, 31 ; <i32> [#uses=1] - %inc = add i32 %ELIM4, 1 ; <i32> [#uses=1] - %ELIM5 = and i32 %inc, 1 ; <i32> [#uses=1] - %ELIM7 = and i32 %B, -2 ; <i32> [#uses=1] - %tmp.8 = or i32 %ELIM5, %ELIM7 ; <i32> [#uses=1] - ret i32 %tmp.8 -} - diff --git a/test/Transforms/InstCombine/cast.ll b/test/Transforms/InstCombine/cast.ll index 4621d33d4388..a4375a5cd57e 100644 --- a/test/Transforms/InstCombine/cast.ll +++ b/test/Transforms/InstCombine/cast.ll @@ -1432,3 +1432,41 @@ define <2 x i32> @test90() { %tmp6 = bitcast <4 x half> <half undef, half undef, half undef, half 0xH3C00> to <2 x i32> ret <2 x i32> %tmp6 } + +; Do not optimize to ashr i64 (shift by 48 > 96 - 64) +define i64 @test91(i64 %A) { +; CHECK-LABEL: @test91( +; CHECK-NEXT: [[B:%.*]] = sext i64 %A to i96 +; CHECK-NEXT: [[C:%.*]] = lshr i96 [[B]], 48 +; CHECK-NEXT: [[D:%.*]] = trunc i96 [[C]] to i64 +; CHECK-NEXT: ret i64 [[D]] +; + %B = sext i64 %A to i96 + %C = lshr i96 %B, 48 + %D = trunc i96 %C to i64 + ret i64 %D +} + +; Do optimize to ashr i64 (shift by 32 <= 96 - 64) +define i64 @test92(i64 %A) { +; CHECK-LABEL: @test92( +; CHECK-NEXT: [[C:%.*]] = ashr i64 %A, 32 +; CHECK-NEXT: ret i64 [[C]] +; + %B = sext i64 %A to i96 + %C = lshr i96 %B, 32 + %D = trunc i96 %C to i64 + ret i64 %D +} + +; When optimizing to ashr i32, don't shift by more than 31. 
+define i32 @test93(i32 %A) { +; CHECK-LABEL: @test93( +; CHECK-NEXT: [[C:%.*]] = ashr i32 %A, 31 +; CHECK-NEXT: ret i32 [[C]] +; + %B = sext i32 %A to i96 + %C = lshr i96 %B, 64 + %D = trunc i96 %C to i32 + ret i32 %D +} diff --git a/test/Transforms/InstCombine/constant-fold-iteration.ll b/test/Transforms/InstCombine/constant-fold-iteration.ll new file mode 100644 index 000000000000..e1b692173ce8 --- /dev/null +++ b/test/Transforms/InstCombine/constant-fold-iteration.ll @@ -0,0 +1,10 @@ +; RUN: opt < %s -instcombine -S -debug 2>&1 | FileCheck %s +; REQUIRES: asserts +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32" + +define i32 @a() nounwind readnone { +entry: + ret i32 zext (i1 icmp eq (i32 0, i32 ptrtoint (i32 ()* @a to i32)) to i32) +} +; CHECK: INSTCOMBINE ITERATION #1 +; CHECK-NOT: INSTCOMBINE ITERATION #2 diff --git a/test/Transforms/InstCombine/demorgan.ll b/test/Transforms/InstCombine/demorgan.ll index 26c2270a3fdf..8c3d3b830468 100644 --- a/test/Transforms/InstCombine/demorgan.ll +++ b/test/Transforms/InstCombine/demorgan.ll @@ -399,7 +399,7 @@ define i32 @demorgan_or_zext(i1 %X, i1 %Y) { ; CHECK-LABEL: @demorgan_or_zext( ; CHECK-NEXT: [[OR1_DEMORGAN:%.*]] = and i1 %X, %Y ; CHECK-NEXT: [[OR1:%.*]] = xor i1 [[OR1_DEMORGAN]], true -; CHECK-NEXT: [[OR:%.*]] = zext i1 [[OR:%.*]]1 to i32 +; CHECK-NEXT: [[OR:%.*]] = zext i1 [[OR1]] to i32 ; CHECK-NEXT: ret i32 [[OR]] ; %zextX = zext i1 %X to i32 @@ -414,7 +414,7 @@ define i32 @demorgan_and_zext(i1 %X, i1 %Y) { ; CHECK-LABEL: @demorgan_and_zext( ; CHECK-NEXT: [[AND1_DEMORGAN:%.*]] = or i1 %X, %Y ; CHECK-NEXT: [[AND1:%.*]] = xor i1 [[AND1_DEMORGAN]], true -; CHECK-NEXT: [[AND:%.*]] = zext i1 [[AND:%.*]]1 to i32 +; CHECK-NEXT: [[AND:%.*]] = zext i1 [[AND1]] to i32 ; CHECK-NEXT: ret i32 [[AND]] ; %zextX = zext i1 %X to i32 @@ -429,7 +429,7 @@ define <2 x i32> @demorgan_or_zext_vec(<2 x i1> %X, <2 x i1> %Y) { ; CHECK-LABEL: @demorgan_or_zext_vec( ; CHECK-NEXT: [[OR1_DEMORGAN:%.*]] = and <2 x i1> %X, %Y ; CHECK-NEXT: [[OR1:%.*]] = xor <2 x i1> [[OR1_DEMORGAN]], <i1 true, i1 true> -; CHECK-NEXT: [[OR:%.*]] = zext <2 x i1> [[OR:%.*]]1 to <2 x i32> +; CHECK-NEXT: [[OR:%.*]] = zext <2 x i1> [[OR1]] to <2 x i32> ; CHECK-NEXT: ret <2 x i32> [[OR]] ; %zextX = zext <2 x i1> %X to <2 x i32> @@ -444,7 +444,7 @@ define <2 x i32> @demorgan_and_zext_vec(<2 x i1> %X, <2 x i1> %Y) { ; CHECK-LABEL: @demorgan_and_zext_vec( ; CHECK-NEXT: [[AND1_DEMORGAN:%.*]] = or <2 x i1> %X, %Y ; CHECK-NEXT: [[AND1:%.*]] = xor <2 x i1> [[AND1_DEMORGAN]], <i1 true, i1 true> -; CHECK-NEXT: [[AND:%.*]] = zext <2 x i1> [[AND:%.*]]1 to <2 x i32> +; CHECK-NEXT: [[AND:%.*]] = zext <2 x i1> [[AND1]] to <2 x i32> ; CHECK-NEXT: ret <2 x i32> [[AND]] ; %zextX = zext <2 x i1> %X to <2 x i32> diff --git a/test/Transforms/InstCombine/icmp.ll b/test/Transforms/InstCombine/icmp.ll index edfa9a102917..6f657b190454 100644 --- a/test/Transforms/InstCombine/icmp.ll +++ b/test/Transforms/InstCombine/icmp.ll @@ -695,6 +695,21 @@ define i1 @test48(i32 %X, i32 %Y, i32 %Z) { ret i1 %C } +; The above transform only works for equality predicates. 
+ +define i1 @PR32949(i32 %X, i32 %Y, i32 %Z) { +; CHECK-LABEL: @PR32949( +; CHECK-NEXT: [[A:%.*]] = sdiv exact i32 %X, %Z +; CHECK-NEXT: [[B:%.*]] = sdiv exact i32 %Y, %Z +; CHECK-NEXT: [[C:%.*]] = icmp sgt i32 [[A]], [[B]] +; CHECK-NEXT: ret i1 [[C]] +; + %A = sdiv exact i32 %X, %Z + %B = sdiv exact i32 %Y, %Z + %C = icmp sgt i32 %A, %B + ret i1 %C +} + ; PR8469 define <2 x i1> @test49(<2 x i32> %tmp3) { ; CHECK-LABEL: @test49( diff --git a/test/Transforms/InstCombine/intrinsics.ll b/test/Transforms/InstCombine/intrinsics.ll index 66ab7f48aeff..5654b265da58 100644 --- a/test/Transforms/InstCombine/intrinsics.ll +++ b/test/Transforms/InstCombine/intrinsics.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -instcombine -S < %s | FileCheck %s %overflow.result = type {i8, i1} @@ -283,14 +284,24 @@ define i32 @cttz(i32 %a) { define i1 @cttz_knownbits(i32 %arg) { ; CHECK-LABEL: @cttz_knownbits( +; CHECK-NEXT: ret i1 false +; + %or = or i32 %arg, 4 + %cnt = call i32 @llvm.cttz.i32(i32 %or, i1 true) nounwind readnone + %res = icmp eq i32 %cnt, 4 + ret i1 %res +} + +define i1 @cttz_knownbits2(i32 %arg) { +; CHECK-LABEL: @cttz_knownbits2( ; CHECK-NEXT: [[OR:%.*]] = or i32 [[ARG:%.*]], 4 ; CHECK-NEXT: [[CNT:%.*]] = call i32 @llvm.cttz.i32(i32 [[OR]], i1 true) -; CHECK-NEXT: [[RES:%.*]] = icmp eq i32 [[CNT]], 4 +; CHECK-NEXT: [[RES:%.*]] = icmp eq i32 [[CNT]], 2 ; CHECK-NEXT: ret i1 [[RES]] ; %or = or i32 %arg, 4 %cnt = call i32 @llvm.cttz.i32(i32 %or, i1 true) nounwind readnone - %res = icmp eq i32 %cnt, 4 + %res = icmp eq i32 %cnt, 2 ret i1 %res } @@ -306,14 +317,24 @@ define i8 @ctlz(i8 %a) { define i1 @ctlz_knownbits(i8 %arg) { ; CHECK-LABEL: @ctlz_knownbits( +; CHECK-NEXT: ret i1 false +; + %or = or i8 %arg, 32 + %cnt = call i8 @llvm.ctlz.i8(i8 %or, i1 true) nounwind readnone + %res = icmp eq i8 %cnt, 4 + ret i1 %res +} + +define i1 @ctlz_knownbits2(i8 %arg) { +; CHECK-LABEL: @ctlz_knownbits2( ; CHECK-NEXT: [[OR:%.*]] = or i8 [[ARG:%.*]], 32 ; CHECK-NEXT: [[CNT:%.*]] = call i8 @llvm.ctlz.i8(i8 [[OR]], i1 true) -; CHECK-NEXT: [[RES:%.*]] = icmp eq i8 [[CNT]], 4 +; CHECK-NEXT: [[RES:%.*]] = icmp eq i8 [[CNT]], 2 ; CHECK-NEXT: ret i1 [[RES]] ; %or = or i8 %arg, 32 %cnt = call i8 @llvm.ctlz.i8(i8 %or, i1 true) nounwind readnone - %res = icmp eq i8 %cnt, 4 + %res = icmp eq i8 %cnt, 2 ret i1 %res } diff --git a/test/Transforms/InstCombine/logical-select.ll b/test/Transforms/InstCombine/logical-select.ll index 3ab40c4de92d..7f0bd23eb8a5 100644 --- a/test/Transforms/InstCombine/logical-select.ll +++ b/test/Transforms/InstCombine/logical-select.ll @@ -62,6 +62,81 @@ define i32 @poo(i32 %a, i32 %b, i32 %c, i32 %d) { ret i32 %t3 } +; TODO: For the next 4 tests, are there potential canonicalizations and/or folds for these +; in InstCombine? Independent of that, tests like this that may not show any transforms +; still have value because they can help identify conflicting canonicalization rules that +; lead to infinite looping. + +; PR32791 - https://bugs.llvm.org//show_bug.cgi?id=32791 +; Fold two selects with inverted predicates and zero operands. 
+define i32 @fold_inverted_icmp_preds(i32 %a, i32 %b, i32 %c, i32 %d) { +; CHECK-LABEL: @fold_inverted_icmp_preds( +; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 %a, %b +; CHECK-NEXT: [[SEL1:%.*]] = select i1 [[CMP1]], i32 %c, i32 0 +; CHECK-NEXT: [[CMP2:%.*]] = icmp sge i32 %a, %b +; CHECK-NEXT: [[SEL2:%.*]] = select i1 [[CMP2]], i32 %d, i32 0 +; CHECK-NEXT: [[OR:%.*]] = or i32 [[SEL1]], [[SEL2]] +; CHECK-NEXT: ret i32 [[OR]] +; + %cmp1 = icmp slt i32 %a, %b + %sel1 = select i1 %cmp1, i32 %c, i32 0 + %cmp2 = icmp sge i32 %a, %b + %sel2 = select i1 %cmp2, i32 %d, i32 0 + %or = or i32 %sel1, %sel2 + ret i32 %or +} + +define i32 @fold_inverted_icmp_preds_reverse(i32 %a, i32 %b, i32 %c, i32 %d) { +; CHECK-LABEL: @fold_inverted_icmp_preds_reverse( +; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 %a, %b +; CHECK-NEXT: [[SEL1:%.*]] = select i1 [[CMP1]], i32 0, i32 %c +; CHECK-NEXT: [[CMP2:%.*]] = icmp sge i32 %a, %b +; CHECK-NEXT: [[SEL2:%.*]] = select i1 [[CMP2]], i32 0, i32 %d +; CHECK-NEXT: [[OR:%.*]] = or i32 [[SEL1]], [[SEL2]] +; CHECK-NEXT: ret i32 [[OR]] +; + %cmp1 = icmp slt i32 %a, %b + %sel1 = select i1 %cmp1, i32 0, i32 %c + %cmp2 = icmp sge i32 %a, %b + %sel2 = select i1 %cmp2, i32 0, i32 %d + %or = or i32 %sel1, %sel2 + ret i32 %or +} + +define i32 @fold_inverted_fcmp_preds(float %a, float %b, i32 %c, i32 %d) { +; CHECK-LABEL: @fold_inverted_fcmp_preds( +; CHECK-NEXT: [[CMP1:%.*]] = fcmp olt float %a, %b +; CHECK-NEXT: [[SEL1:%.*]] = select i1 [[CMP1]], i32 %c, i32 0 +; CHECK-NEXT: [[CMP2:%.*]] = fcmp uge float %a, %b +; CHECK-NEXT: [[SEL2:%.*]] = select i1 [[CMP2]], i32 %d, i32 0 +; CHECK-NEXT: [[OR:%.*]] = or i32 [[SEL1]], [[SEL2]] +; CHECK-NEXT: ret i32 [[OR]] +; + %cmp1 = fcmp olt float %a, %b + %sel1 = select i1 %cmp1, i32 %c, i32 0 + %cmp2 = fcmp uge float %a, %b + %sel2 = select i1 %cmp2, i32 %d, i32 0 + %or = or i32 %sel1, %sel2 + ret i32 %or +} + +define <2 x i32> @fold_inverted_icmp_vector_preds(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) { +; CHECK-LABEL: @fold_inverted_icmp_vector_preds( +; CHECK-NEXT: [[CMP1:%.*]] = icmp ne <2 x i32> %a, %b +; CHECK-NEXT: [[SEL1:%.*]] = select <2 x i1> [[CMP1]], <2 x i32> %c, <2 x i32> zeroinitializer +; CHECK-NEXT: [[CMP2:%.*]] = icmp eq <2 x i32> %a, %b +; CHECK-NEXT: [[SEL2:%.*]] = select <2 x i1> [[CMP2]], <2 x i32> %d, <2 x i32> zeroinitializer +; CHECK-NEXT: [[OR:%.*]] = or <2 x i32> [[SEL1]], [[SEL2]] +; CHECK-NEXT: ret <2 x i32> [[OR]] +; + %cmp1 = icmp ne <2 x i32> %a, %b + %sel1 = select <2 x i1> %cmp1, <2 x i32> %c, <2 x i32> <i32 0, i32 0> + %cmp2 = icmp eq <2 x i32> %a, %b + %sel2 = select <2 x i1> %cmp2, <2 x i32> %d, <2 x i32> <i32 0, i32 0> + %or = or <2 x i32> %sel1, %sel2 + ret <2 x i32> %or +} + define i32 @par(i32 %a, i32 %b, i32 %c, i32 %d) { ; CHECK-LABEL: @par( ; CHECK-NEXT: [[T0:%.*]] = icmp slt i32 %a, %b diff --git a/test/Transforms/InstCombine/not.ll b/test/Transforms/InstCombine/not.ll index 2760d4ae044d..6ff0a50318d2 100644 --- a/test/Transforms/InstCombine/not.ll +++ b/test/Transforms/InstCombine/not.ll @@ -11,8 +11,8 @@ define i32 @test1(i32 %A) { define i1 @invert_icmp(i32 %A, i32 %B) { ; CHECK-LABEL: @invert_icmp( -; CHECK-NEXT: [[NOT:%.*]] = icmp sgt i32 %A, %B -; CHECK-NEXT: ret i1 [[NOT]] +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 %A, %B +; CHECK-NEXT: ret i1 [[CMP]] ; %cmp = icmp sle i32 %A, %B %not = xor i1 %cmp, true @@ -23,8 +23,8 @@ define i1 @invert_icmp(i32 %A, i32 %B) { define i1 @invert_fcmp(float %X, float %Y) { ; CHECK-LABEL: @invert_fcmp( -; CHECK-NEXT: [[NOT:%.*]] = fcmp uge float %X, %Y 
-; CHECK-NEXT: ret i1 [[NOT]] +; CHECK-NEXT: [[CMP:%.*]] = fcmp uge float %X, %Y +; CHECK-NEXT: ret i1 [[CMP]] ; %cmp = fcmp olt float %X, %Y %not = xor i1 %cmp, true @@ -48,11 +48,75 @@ define zeroext i8 @test6(i32 %a, i32 %b) { define <2 x i1> @test7(<2 x i32> %A, <2 x i32> %B) { ; CHECK-LABEL: @test7( -; CHECK-NEXT: [[RET:%.*]] = icmp sgt <2 x i32> %A, %B -; CHECK-NEXT: ret <2 x i1> [[RET]] +; CHECK-NEXT: [[COND:%.*]] = icmp sgt <2 x i32> %A, %B +; CHECK-NEXT: ret <2 x i1> [[COND]] ; %cond = icmp sle <2 x i32> %A, %B %Ret = xor <2 x i1> %cond, <i1 true, i1 true> ret <2 x i1> %Ret } +define i32 @not_ashr_not(i32 %A, i32 %B) { +; CHECK-LABEL: @not_ashr_not( +; CHECK-NEXT: [[NOT2:%.*]] = ashr i32 %A, %B +; CHECK-NEXT: ret i32 [[NOT2]] +; + %not1 = xor i32 %A, -1 + %ashr = ashr i32 %not1, %B + %not2 = xor i32 %ashr, -1 + ret i32 %not2 +} + +define i8 @not_ashr_const(i8 %x) { +; CHECK-LABEL: @not_ashr_const( +; CHECK-NEXT: [[NOT:%.*]] = lshr i8 41, %x +; CHECK-NEXT: ret i8 [[NOT]] +; + %shr = ashr i8 -42, %x + %not = xor i8 %shr, -1 + ret i8 %not +} + +define <2 x i8> @not_ashr_const_splat(<2 x i8> %x) { +; CHECK-LABEL: @not_ashr_const_splat( +; CHECK-NEXT: [[NOT:%.*]] = lshr <2 x i8> <i8 41, i8 41>, %x +; CHECK-NEXT: ret <2 x i8> [[NOT]] +; + %shr = ashr <2 x i8> <i8 -42, i8 -42>, %x + %not = xor <2 x i8> %shr, <i8 -1, i8 -1> + ret <2 x i8> %not +} + +; We can't get rid of the 'not' on a logical shift of a negative constant. + +define i8 @not_lshr_const_negative(i8 %x) { +; CHECK-LABEL: @not_lshr_const_negative( +; CHECK-NEXT: [[SHR:%.*]] = lshr i8 -42, %x +; CHECK-NEXT: [[NOT:%.*]] = xor i8 [[SHR]], -1 +; CHECK-NEXT: ret i8 [[NOT]] +; + %shr = lshr i8 -42, %x + %not = xor i8 %shr, -1 + ret i8 %not +} + +define i8 @not_lshr_const(i8 %x) { +; CHECK-LABEL: @not_lshr_const( +; CHECK-NEXT: [[NOT:%.*]] = ashr i8 -43, %x +; CHECK-NEXT: ret i8 [[NOT]] +; + %shr = lshr i8 42, %x + %not = xor i8 %shr, -1 + ret i8 %not +} + +define <2 x i8> @not_lshr_const_splat(<2 x i8> %x) { +; CHECK-LABEL: @not_lshr_const_splat( +; CHECK-NEXT: [[NOT:%.*]] = ashr <2 x i8> <i8 -43, i8 -43>, %x +; CHECK-NEXT: ret <2 x i8> [[NOT]] +; + %shr = lshr <2 x i8> <i8 42, i8 42>, %x + %not = xor <2 x i8> %shr, <i8 -1, i8 -1> + ret <2 x i8> %not +} + diff --git a/test/Transforms/InstCombine/or-xor.ll b/test/Transforms/InstCombine/or-xor.ll index ec5b71656a47..f2bc290d79a4 100644 --- a/test/Transforms/InstCombine/or-xor.ll +++ b/test/Transforms/InstCombine/or-xor.ll @@ -230,3 +230,73 @@ define i32 @test16(i32 %a, i32 %b) { %xor = or i32 %and1, %and2 ret i32 %xor } + +define i8 @not_or(i8 %x) { +; CHECK-LABEL: @not_or( +; CHECK-NEXT: [[NOTX:%.*]] = or i8 %x, 7 +; CHECK-NEXT: [[OR:%.*]] = xor i8 [[NOTX]], -8 +; CHECK-NEXT: ret i8 [[OR]] +; + %notx = xor i8 %x, -1 + %or = or i8 %notx, 7 + ret i8 %or +} + +define i8 @not_or_xor(i8 %x) { +; CHECK-LABEL: @not_or_xor( +; CHECK-NEXT: [[NOTX:%.*]] = or i8 %x, 7 +; CHECK-NEXT: [[XOR:%.*]] = xor i8 [[NOTX]], -12 +; CHECK-NEXT: ret i8 [[XOR]] +; + %notx = xor i8 %x, -1 + %or = or i8 %notx, 7 + %xor = xor i8 %or, 12 + ret i8 %xor +} + +define i8 @xor_or(i8 %x) { +; CHECK-LABEL: @xor_or( +; CHECK-NEXT: [[XOR:%.*]] = or i8 %x, 7 +; CHECK-NEXT: [[OR:%.*]] = xor i8 [[XOR]], 32 +; CHECK-NEXT: ret i8 [[OR]] +; + %xor = xor i8 %x, 32 + %or = or i8 %xor, 7 + ret i8 %or +} + +define i8 @xor_or2(i8 %x) { +; CHECK-LABEL: @xor_or2( +; CHECK-NEXT: [[XOR:%.*]] = or i8 %x, 7 +; CHECK-NEXT: [[OR:%.*]] = xor i8 [[XOR]], 32 +; CHECK-NEXT: ret i8 [[OR]] +; + %xor = xor i8 %x, 33 + %or = or i8 %xor, 7 + ret i8 
%or +} + +define i8 @xor_or_xor(i8 %x) { +; CHECK-LABEL: @xor_or_xor( +; CHECK-NEXT: [[XOR1:%.*]] = or i8 %x, 7 +; CHECK-NEXT: [[XOR2:%.*]] = xor i8 [[XOR1]], 44 +; CHECK-NEXT: ret i8 [[XOR2]] +; + %xor1 = xor i8 %x, 33 + %or = or i8 %xor1, 7 + %xor2 = xor i8 %or, 12 + ret i8 %xor2 +} + +define i8 @or_xor_or(i8 %x) { +; CHECK-LABEL: @or_xor_or( +; CHECK-NEXT: [[XOR:%.*]] = or i8 %x, 39 +; CHECK-NEXT: [[OR2:%.*]] = xor i8 [[XOR]], 8 +; CHECK-NEXT: ret i8 [[OR2]] +; + %or1 = or i8 %x, 33 + %xor = xor i8 %or1, 12 + %or2 = or i8 %xor, 7 + ret i8 %or2 +} + diff --git a/test/Transforms/InstCombine/or.ll b/test/Transforms/InstCombine/or.ll index bfafd66ebb41..764fe4503b5e 100644 --- a/test/Transforms/InstCombine/or.ll +++ b/test/Transforms/InstCombine/or.ll @@ -3,115 +3,6 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" -define i32 @test1(i32 %A) { -; CHECK-LABEL: @test1( -; CHECK-NEXT: ret i32 %A -; - %B = or i32 %A, 0 - ret i32 %B -} - -define i32 @test2(i32 %A) { -; CHECK-LABEL: @test2( -; CHECK-NEXT: ret i32 -1 -; - %B = or i32 %A, -1 - ret i32 %B -} - -define i8 @test2a(i8 %A) { -; CHECK-LABEL: @test2a( -; CHECK-NEXT: ret i8 -1 -; - %B = or i8 %A, -1 - ret i8 %B -} - -define i1 @test3(i1 %A) { -; CHECK-LABEL: @test3( -; CHECK-NEXT: ret i1 %A -; - %B = or i1 %A, false - ret i1 %B -} - -define i1 @test4(i1 %A) { -; CHECK-LABEL: @test4( -; CHECK-NEXT: ret i1 true -; - %B = or i1 %A, true - ret i1 %B -} - -define i1 @test5(i1 %A) { -; CHECK-LABEL: @test5( -; CHECK-NEXT: ret i1 %A -; - %B = or i1 %A, %A - ret i1 %B -} - -define i32 @test6(i32 %A) { -; CHECK-LABEL: @test6( -; CHECK-NEXT: ret i32 %A -; - %B = or i32 %A, %A - ret i32 %B -} - -; A | ~A == -1 -define i32 @test7(i32 %A) { -; CHECK-LABEL: @test7( -; CHECK-NEXT: ret i32 -1 -; - %NotA = xor i32 -1, %A - %B = or i32 %A, %NotA - ret i32 %B -} - -define i8 @test8(i8 %A) { -; CHECK-LABEL: @test8( -; CHECK-NEXT: ret i8 -1 -; - %B = or i8 %A, -2 - %C = or i8 %B, 1 - ret i8 %C -} - -; Test that (A|c1)|(B|c2) == (A|B)|(c1|c2) -define i8 @test9(i8 %A, i8 %B) { -; CHECK-LABEL: @test9( -; CHECK-NEXT: ret i8 -1 -; - %C = or i8 %A, 1 - %D = or i8 %B, -2 - %E = or i8 %C, %D - ret i8 %E -} - -define i8 @test10(i8 %A) { -; CHECK-LABEL: @test10( -; CHECK-NEXT: ret i8 -2 -; - %B = or i8 %A, 1 - %C = and i8 %B, -2 - ; (X & C1) | C2 --> (X | C2) & (C1|C2) - %D = or i8 %C, -2 - ret i8 %D -} - -define i8 @test11(i8 %A) { -; CHECK-LABEL: @test11( -; CHECK-NEXT: ret i8 -1 -; - %B = or i8 %A, -2 - %C = xor i8 %B, 13 - ; (X ^ C1) | C2 --> (X | C2) ^ (C1&~C2) - %D = or i8 %C, 1 - %E = xor i8 %D, 12 - ret i8 %E -} - define i32 @test12(i32 %A) { ; Should be eliminated ; CHECK-LABEL: @test12( diff --git a/test/Transforms/InstCombine/sext.ll b/test/Transforms/InstCombine/sext.ll index 4cdd080fb0e0..46406ac2f788 100644 --- a/test/Transforms/InstCombine/sext.ll +++ b/test/Transforms/InstCombine/sext.ll @@ -128,7 +128,7 @@ F: define i32 @test10(i32 %i) { ; CHECK-LABEL: @test10( ; CHECK-NEXT: [[B1:%.*]] = shl i32 %i, 30 -; CHECK-NEXT: [[B:%.*]] = ashr exact i32 [[B:%.*]]1, 30 +; CHECK-NEXT: [[B:%.*]] = ashr exact i32 [[B1]], 30 ; CHECK-NEXT: ret i32 [[B]] ; %tmp12 = trunc i32 %i to i8 diff --git a/test/Transforms/InstCombine/trunc.ll b/test/Transforms/InstCombine/trunc.ll index 5597b578f017..dd86e5a907b8 100644 --- a/test/Transforms/InstCombine/trunc.ll +++ b/test/Transforms/InstCombine/trunc.ll @@ -24,7 +24,7 @@ define i64 @test2(i64 %a) { ; CHECK-LABEL: @test2( ; CHECK-NEXT: 
[[B:%.*]] = trunc i64 %a to i32 ; CHECK-NEXT: [[D1:%.*]] = shl i64 %a, 36 -; CHECK-NEXT: [[D:%.*]] = ashr exact i64 [[D:%.*]]1, 36 +; CHECK-NEXT: [[D:%.*]] = ashr exact i64 [[D1]], 36 ; CHECK-NEXT: call void @use(i32 [[B]]) ; CHECK-NEXT: ret i64 [[D]] ; diff --git a/test/Transforms/InstCombine/vec_demanded_elts.ll b/test/Transforms/InstCombine/vec_demanded_elts.ll index 5f27634da19c..00efbe00b08d 100644 --- a/test/Transforms/InstCombine/vec_demanded_elts.ll +++ b/test/Transforms/InstCombine/vec_demanded_elts.ll @@ -2,30 +2,6 @@ ; RUN: opt < %s -instcombine -S | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -define i16 @test1(float %f) { -; CHECK-LABEL: @test1( -; CHECK-NEXT: [[TMP281:%.*]] = fadd float %f, -1.000000e+00 -; CHECK-NEXT: [[TMP373:%.*]] = fmul float [[TMP281]], 5.000000e-01 -; CHECK-NEXT: [[TMP374:%.*]] = insertelement <4 x float> undef, float [[TMP373]], i32 0 -; CHECK-NEXT: [[TMP48:%.*]] = tail call <4 x float> @llvm.x86.sse.min.ss(<4 x float> [[TMP374]], <4 x float> <float 6.553500e+04, float undef, float undef, float undef>) -; CHECK-NEXT: [[TMP59:%.*]] = tail call <4 x float> @llvm.x86.sse.max.ss(<4 x float> [[TMP48]], <4 x float> <float 0.000000e+00, float undef, float undef, float undef>) -; CHECK-NEXT: [[TMP_UPGRD_1:%.*]] = tail call i32 @llvm.x86.sse.cvttss2si(<4 x float> [[TMP59]]) -; CHECK-NEXT: [[TMP69:%.*]] = trunc i32 [[TMP_UPGRD_1]] to i16 -; CHECK-NEXT: ret i16 [[TMP69]] -; - %tmp = insertelement <4 x float> undef, float %f, i32 0 - %tmp10 = insertelement <4 x float> %tmp, float 0.000000e+00, i32 1 - %tmp11 = insertelement <4 x float> %tmp10, float 0.000000e+00, i32 2 - %tmp12 = insertelement <4 x float> %tmp11, float 0.000000e+00, i32 3 - %tmp28 = tail call <4 x float> @llvm.x86.sse.sub.ss( <4 x float> %tmp12, <4 x float> < float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 > ) - %tmp37 = tail call <4 x float> @llvm.x86.sse.mul.ss( <4 x float> %tmp28, <4 x float> < float 5.000000e-01, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 > ) - %tmp48 = tail call <4 x float> @llvm.x86.sse.min.ss( <4 x float> %tmp37, <4 x float> < float 6.553500e+04, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 > ) - %tmp59 = tail call <4 x float> @llvm.x86.sse.max.ss( <4 x float> %tmp48, <4 x float> zeroinitializer ) - %tmp.upgrd.1 = tail call i32 @llvm.x86.sse.cvttss2si( <4 x float> %tmp59 ) - %tmp69 = trunc i32 %tmp.upgrd.1 to i16 - ret i16 %tmp69 -} - define i32 @test2(float %f) { ; CHECK-LABEL: @test2( ; CHECK-NEXT: [[TMP5:%.*]] = fmul float %f, %f @@ -42,77 +18,6 @@ define i32 @test2(float %f) { ret i32 %tmp21 } -define i64 @test3(float %f, double %d) { -; CHECK-LABEL: @test3( -; CHECK-NEXT: [[V00:%.*]] = insertelement <4 x float> undef, float %f, i32 0 -; CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.x86.sse.cvtss2si(<4 x float> [[V00]]) -; CHECK-NEXT: [[V10:%.*]] = insertelement <4 x float> undef, float %f, i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.x86.sse.cvtss2si64(<4 x float> [[V10]]) -; CHECK-NEXT: [[V20:%.*]] = insertelement <4 x float> undef, float %f, i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = tail call i32 @llvm.x86.sse.cvttss2si(<4 x float> [[V20]]) -; CHECK-NEXT: [[V30:%.*]] = insertelement <4 x float> undef, float %f, i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = tail call i64 @llvm.x86.sse.cvttss2si64(<4 x float> [[V30]]) -; CHECK-NEXT: [[V40:%.*]] = insertelement <2 x double> undef, double %d, i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = tail call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> [[V40]]) 
-; CHECK-NEXT: [[V50:%.*]] = insertelement <2 x double> undef, double %d, i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = tail call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> [[V50]]) -; CHECK-NEXT: [[V60:%.*]] = insertelement <2 x double> undef, double %d, i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = tail call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> [[V60]]) -; CHECK-NEXT: [[V70:%.*]] = insertelement <2 x double> undef, double %d, i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = tail call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> [[V70]]) -; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP0]], [[TMP2]] -; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP4]], [[TMP6]] -; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[TMP8]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = sext i32 [[TMP10]] to i64 -; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[TMP5]], [[TMP7]] -; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[TMP12]], [[TMP13]] -; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[TMP14]], [[TMP11]] -; CHECK-NEXT: ret i64 [[TMP15]] -; - %v00 = insertelement <4 x float> undef, float %f, i32 0 - %v01 = insertelement <4 x float> %v00, float 0.000000e+00, i32 1 - %v02 = insertelement <4 x float> %v01, float 0.000000e+00, i32 2 - %v03 = insertelement <4 x float> %v02, float 0.000000e+00, i32 3 - %tmp0 = tail call i32 @llvm.x86.sse.cvtss2si(<4 x float> %v03) - %v10 = insertelement <4 x float> undef, float %f, i32 0 - %v11 = insertelement <4 x float> %v10, float 0.000000e+00, i32 1 - %v12 = insertelement <4 x float> %v11, float 0.000000e+00, i32 2 - %v13 = insertelement <4 x float> %v12, float 0.000000e+00, i32 3 - %tmp1 = tail call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %v13) - %v20 = insertelement <4 x float> undef, float %f, i32 0 - %v21 = insertelement <4 x float> %v20, float 0.000000e+00, i32 1 - %v22 = insertelement <4 x float> %v21, float 0.000000e+00, i32 2 - %v23 = insertelement <4 x float> %v22, float 0.000000e+00, i32 3 - %tmp2 = tail call i32 @llvm.x86.sse.cvttss2si(<4 x float> %v23) - %v30 = insertelement <4 x float> undef, float %f, i32 0 - %v31 = insertelement <4 x float> %v30, float 0.000000e+00, i32 1 - %v32 = insertelement <4 x float> %v31, float 0.000000e+00, i32 2 - %v33 = insertelement <4 x float> %v32, float 0.000000e+00, i32 3 - %tmp3 = tail call i64 @llvm.x86.sse.cvttss2si64(<4 x float> %v33) - %v40 = insertelement <2 x double> undef, double %d, i32 0 - %v41 = insertelement <2 x double> %v40, double 0.000000e+00, i32 1 - %tmp4 = tail call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %v41) - %v50 = insertelement <2 x double> undef, double %d, i32 0 - %v51 = insertelement <2 x double> %v50, double 0.000000e+00, i32 1 - %tmp5 = tail call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> %v51) - %v60 = insertelement <2 x double> undef, double %d, i32 0 - %v61 = insertelement <2 x double> %v60, double 0.000000e+00, i32 1 - %tmp6 = tail call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> %v61) - %v70 = insertelement <2 x double> undef, double %d, i32 0 - %v71 = insertelement <2 x double> %v70, double 0.000000e+00, i32 1 - %tmp7 = tail call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> %v71) - %tmp8 = add i32 %tmp0, %tmp2 - %tmp9 = add i32 %tmp4, %tmp6 - %tmp10 = add i32 %tmp8, %tmp9 - %tmp11 = sext i32 %tmp10 to i64 - %tmp12 = add i64 %tmp1, %tmp3 - %tmp13 = add i64 %tmp5, %tmp7 - %tmp14 = add i64 %tmp12, %tmp13 - %tmp15 = add i64 %tmp11, %tmp14 - ret i64 %tmp15 -} - define void @get_image() nounwind { ; CHECK-LABEL: @get_image( ; CHECK-NEXT: entry: @@ -156,18 +61,6 @@ entry: } declare i32 @fgetc(i8*) -declare <4 x float> 
@llvm.x86.sse.sub.ss(<4 x float>, <4 x float>) -declare <4 x float> @llvm.x86.sse.mul.ss(<4 x float>, <4 x float>) -declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) -declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) -declare i32 @llvm.x86.sse.cvtss2si(<4 x float>) -declare i64 @llvm.x86.sse.cvtss2si64(<4 x float>) -declare i32 @llvm.x86.sse.cvttss2si(<4 x float>) -declare i64 @llvm.x86.sse.cvttss2si64(<4 x float>) -declare i32 @llvm.x86.sse2.cvtsd2si(<2 x double>) -declare i64 @llvm.x86.sse2.cvtsd2si64(<2 x double>) -declare i32 @llvm.x86.sse2.cvttsd2si(<2 x double>) -declare i64 @llvm.x86.sse2.cvttsd2si64(<2 x double>) define <4 x float> @dead_shuffle_elt(<4 x float> %x, <2 x float> %y) nounwind { ; CHECK-LABEL: @dead_shuffle_elt( @@ -248,4 +141,3 @@ define <2 x i64> @PR24922(<2 x i64> %v) { %result = select <2 x i1> <i1 icmp eq (i64 extractelement (<2 x i64> bitcast (<4 x i32> <i32 15, i32 15, i32 15, i32 15> to <2 x i64>), i64 0), i64 0), i1 true>, <2 x i64> %v, <2 x i64> zeroinitializer ret <2 x i64> %result } - diff --git a/test/Transforms/InstCombine/xor2.ll b/test/Transforms/InstCombine/xor2.ll index f817ac5ca40c..3afbf632f6e1 100644 --- a/test/Transforms/InstCombine/xor2.ll +++ b/test/Transforms/InstCombine/xor2.ll @@ -57,17 +57,6 @@ define i32 @test3(i32 %tmp1) { ret i32 %ov110 } -define i32 @test4(i32 %A, i32 %B) { -; CHECK-LABEL: @test4( -; CHECK-NEXT: [[TMP1:%.*]] = ashr i32 %A, %B -; CHECK-NEXT: ret i32 [[TMP1]] -; - %1 = xor i32 %A, -1 - %2 = ashr i32 %1, %B - %3 = xor i32 %2, -1 - ret i32 %3 -} - ; defect-2 in rdar://12329730 ; (X^C1) >> C2) ^ C3 -> (X>>C2) ^ ((C1>>C2)^C3) ; where the "X" has more than one use diff --git a/test/Transforms/InstNamer/basic.ll b/test/Transforms/InstNamer/basic.ll new file mode 100644 index 000000000000..4c819246b90b --- /dev/null +++ b/test/Transforms/InstNamer/basic.ll @@ -0,0 +1,19 @@ +; RUN: opt -S -instnamer < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define i32 @f_0(i32) { +; CHECK-LABEL: @f_0( +; CHECK: bb: +; CHECK-NEXT: %tmp = add i32 %arg, 2 +; CHECK-NEXT: br label %bb1 +; CHECK: bb1: +; CHECK-NEXT: ret i32 %tmp + + %2 = add i32 %0, 2 + br label %3 + +; <label>:3: + ret i32 %2 +} diff --git a/test/Transforms/InstSimplify/AndOrXor.ll b/test/Transforms/InstSimplify/AndOrXor.ll index e059d77f1fa8..427ea655fcb2 100644 --- a/test/Transforms/InstSimplify/AndOrXor.ll +++ b/test/Transforms/InstSimplify/AndOrXor.ll @@ -628,3 +628,176 @@ define i32 @test46_commuted_and(i32 %a, i32 %b) { %or = or i32 %xor, %and ret i32 %or } + +; (~A ^ B) | (A & B) -> ~A ^ B + +define i32 @test47(i32 %a, i32 %b) { +; CHECK-LABEL: @test47( +; CHECK-NEXT: [[NEGA:%.*]] = xor i32 [[A:%.*]], -1 +; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[NEGA]], [[B:%.*]] +; CHECK-NEXT: ret i32 [[XOR]] +; + %nega = xor i32 %a, -1 + %and = and i32 %a, %b + %xor = xor i32 %nega, %b + %or = or i32 %xor, %and + ret i32 %or +} + +define i32 @test48(i32 %a, i32 %b) { +; CHECK-LABEL: @test48( +; CHECK-NEXT: [[NEGA:%.*]] = xor i32 [[A:%.*]], -1 +; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[B:%.*]], [[NEGA]] +; CHECK-NEXT: ret i32 [[XOR]] +; + %nega = xor i32 %a, -1 + %and = and i32 %a, %b + %xor = xor i32 %b, %nega + %or = or i32 %xor, %and + ret i32 %or +} + +define i32 @test49(i32 %a, i32 %b) { +; CHECK-LABEL: @test49( +; CHECK-NEXT: [[NEGA:%.*]] = xor i32 [[A:%.*]], -1 +; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[B:%.*]], [[NEGA]] +; CHECK-NEXT: ret i32 [[XOR]] +; + %nega = xor i32 %a, -1 
+ %and = and i32 %b, %a + %xor = xor i32 %b, %nega + %or = or i32 %xor, %and + ret i32 %or +} + +define i32 @test50(i32 %a, i32 %b) { +; CHECK-LABEL: @test50( +; CHECK-NEXT: [[NEGA:%.*]] = xor i32 [[A:%.*]], -1 +; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[NEGA]], [[B:%.*]] +; CHECK-NEXT: ret i32 [[XOR]] +; + %nega = xor i32 %a, -1 + %and = and i32 %b, %a + %xor = xor i32 %nega, %b + %or = or i32 %xor, %and + ret i32 %or +} + +define i32 @test51(i32 %a, i32 %b) { +; CHECK-LABEL: @test51( +; CHECK-NEXT: [[NEGA:%.*]] = xor i32 [[A:%.*]], -1 +; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[NEGA]], [[B:%.*]] +; CHECK-NEXT: ret i32 [[XOR]] +; + %nega = xor i32 %a, -1 + %and = and i32 %a, %b + %xor = xor i32 %nega, %b + %or = or i32 %and, %xor + ret i32 %or +} + +define i32 @test52(i32 %a, i32 %b) { +; CHECK-LABEL: @test52( +; CHECK-NEXT: [[NEGA:%.*]] = xor i32 [[A:%.*]], -1 +; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[B:%.*]], [[NEGA]] +; CHECK-NEXT: ret i32 [[XOR]] +; + %nega = xor i32 %a, -1 + %and = and i32 %a, %b + %xor = xor i32 %b, %nega + %or = or i32 %and, %xor + ret i32 %or +} + +define i32 @test53(i32 %a, i32 %b) { +; CHECK-LABEL: @test53( +; CHECK-NEXT: [[NEGA:%.*]] = xor i32 [[A:%.*]], -1 +; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[B:%.*]], [[NEGA]] +; CHECK-NEXT: ret i32 [[XOR]] +; + %nega = xor i32 %a, -1 + %and = and i32 %b, %a + %xor = xor i32 %b, %nega + %or = or i32 %and, %xor + ret i32 %or +} + +define i32 @test54(i32 %a, i32 %b) { +; CHECK-LABEL: @test54( +; CHECK-NEXT: [[NEGA:%.*]] = xor i32 [[A:%.*]], -1 +; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[NEGA]], [[B:%.*]] +; CHECK-NEXT: ret i32 [[XOR]] +; + %nega = xor i32 %a, -1 + %and = and i32 %b, %a + %xor = xor i32 %nega, %b + %or = or i32 %and, %xor + ret i32 %or +} + +define i8 @lshr_perfect_mask(i8 %x) { +; CHECK-LABEL: @lshr_perfect_mask( +; CHECK-NEXT: [[SH:%.*]] = lshr i8 %x, 5 +; CHECK-NEXT: [[MASK:%.*]] = and i8 [[SH]], 7 +; CHECK-NEXT: ret i8 [[MASK]] +; + %sh = lshr i8 %x, 5 + %mask = and i8 %sh, 7 ; 0x07 + ret i8 %mask +} + +define <2 x i8> @lshr_oversized_mask_splat(<2 x i8> %x) { +; CHECK-LABEL: @lshr_oversized_mask_splat( +; CHECK-NEXT: [[SH:%.*]] = lshr <2 x i8> %x, <i8 5, i8 5> +; CHECK-NEXT: [[MASK:%.*]] = and <2 x i8> [[SH]], <i8 -121, i8 -121> +; CHECK-NEXT: ret <2 x i8> [[MASK]] +; + %sh = lshr <2 x i8> %x, <i8 5, i8 5> + %mask = and <2 x i8> %sh, <i8 135, i8 135> ; 0x87 + ret <2 x i8> %mask +} + +define i8 @lshr_undersized_mask(i8 %x) { +; CHECK-LABEL: @lshr_undersized_mask( +; CHECK-NEXT: [[SH:%.*]] = lshr i8 %x, 5 +; CHECK-NEXT: [[MASK:%.*]] = and i8 [[SH]], -2 +; CHECK-NEXT: ret i8 [[MASK]] +; + %sh = lshr i8 %x, 5 + %mask = and i8 %sh, -2 ; 0xFE + ret i8 %mask +} + +define <2 x i8> @shl_perfect_mask_splat(<2 x i8> %x) { +; CHECK-LABEL: @shl_perfect_mask_splat( +; CHECK-NEXT: [[SH:%.*]] = shl <2 x i8> %x, <i8 6, i8 6> +; CHECK-NEXT: [[MASK:%.*]] = and <2 x i8> [[SH]], <i8 -64, i8 -64> +; CHECK-NEXT: ret <2 x i8> [[MASK]] +; + %sh = shl <2 x i8> %x, <i8 6, i8 6> + %mask = and <2 x i8> %sh, <i8 192, i8 192> ; 0xC0 + ret <2 x i8> %mask +} + +define i8 @shl_oversized_mask(i8 %x) { +; CHECK-LABEL: @shl_oversized_mask( +; CHECK-NEXT: [[SH:%.*]] = shl i8 %x, 6 +; CHECK-NEXT: [[MASK:%.*]] = and i8 [[SH]], -61 +; CHECK-NEXT: ret i8 [[MASK]] +; + %sh = shl i8 %x, 6 + %mask = and i8 %sh, 195 ; 0xC3 + ret i8 %mask +} + +define <2 x i8> @shl_undersized_mask_splat(<2 x i8> %x) { +; CHECK-LABEL: @shl_undersized_mask_splat( +; CHECK-NEXT: [[SH:%.*]] = shl <2 x i8> [[X:%.*]], <i8 6, i8 6> +; CHECK-NEXT: [[MASK:%.*]] = and <2 x i8> [[SH]], <i8 -120, 
i8 -120> +; CHECK-NEXT: ret <2 x i8> [[MASK]] +; + %sh = shl <2 x i8> %x, <i8 6, i8 6> + %mask = and <2 x i8> %sh, <i8 136, i8 136> ; 0x88 + ret <2 x i8> %mask +} + diff --git a/test/Transforms/InstSimplify/apint-or.ll b/test/Transforms/InstSimplify/apint-or.ll deleted file mode 100644 index e3dc2c48fb40..000000000000 --- a/test/Transforms/InstSimplify/apint-or.ll +++ /dev/null @@ -1,72 +0,0 @@ -; NOTE: Assertions have been autogenerated by update_test_checks.py -; RUN: opt < %s -instsimplify -S | FileCheck %s - -; Test the case where integer BitWidth <= 64 && BitWidth % 2 != 0. -define i39 @test1(i39 %V, i39 %M) { -; CHECK-LABEL: @test1( -; CHECK: [[N:%.*]] = and i39 %M, -274877906944 -; CHECK-NEXT: [[A:%.*]] = add i39 %V, [[N]] -; CHECK-NEXT: ret i39 [[A]] -; - ;; If we have: ((V + N) & C1) | (V & C2) - ;; .. and C2 = ~C1 and C2 is 0+1+ and (N & C2) == 0 - ;; replace with V+N. - %C1 = xor i39 274877906943, -1 ;; C2 = 274877906943 - %N = and i39 %M, 274877906944 - %A = add i39 %V, %N - %B = and i39 %A, %C1 - %D = and i39 %V, 274877906943 - %R = or i39 %B, %D - ret i39 %R -} - -define i7 @test2(i7 %X) { -; CHECK-LABEL: @test2( -; CHECK: ret i7 %X -; - %Y = or i7 %X, 0 - ret i7 %Y -} - -define i17 @test3(i17 %X) { -; CHECK-LABEL: @test3( -; CHECK: ret i17 -1 -; - %Y = or i17 %X, -1 - ret i17 %Y -} - -; Test the case where Integer BitWidth > 64 && BitWidth <= 1024. -define i399 @test4(i399 %V, i399 %M) { -; CHECK-LABEL: @test4( -; CHECK: [[N:%.*]] = and i399 %M, 18446742974197923840 -; CHECK-NEXT: [[A:%.*]] = add i399 %V, [[N]] -; CHECK-NEXT: ret i399 [[A]] -; - ;; If we have: ((V + N) & C1) | (V & C2) - ;; .. and C2 = ~C1 and C2 is 0+1+ and (N & C2) == 0 - ;; replace with V+N. - %C1 = xor i399 274877906943, -1 ;; C2 = 274877906943 - %N = and i399 %M, 18446742974197923840 - %A = add i399 %V, %N - %B = and i399 %A, %C1 - %D = and i399 %V, 274877906943 - %R = or i399 %B, %D - ret i399 %R -} - -define i777 @test5(i777 %X) { -; CHECK-LABEL: @test5( -; CHECK: ret i777 %X -; - %Y = or i777 %X, 0 - ret i777 %Y -} - -define i117 @test6(i117 %X) { -; CHECK-LABEL: @test6( -; CHECK: ret i117 -1 -; - %Y = or i117 %X, -1 - ret i117 %Y -} diff --git a/test/Transforms/InstSimplify/compare.ll b/test/Transforms/InstSimplify/compare.ll index 883bf31ff77a..d6f1b634102f 100644 --- a/test/Transforms/InstSimplify/compare.ll +++ b/test/Transforms/InstSimplify/compare.ll @@ -598,11 +598,14 @@ define i1 @sdiv_exact_equality(i32 %Z) { ret i1 %C } -; FIXME: But not other preds: PR32949 - https://bugs.llvm.org/show_bug.cgi?id=32949 +; But not other preds: PR32949 - https://bugs.llvm.org/show_bug.cgi?id=32949 define i1 @sdiv_exact_not_equality(i32 %Z) { ; CHECK-LABEL: @sdiv_exact_not_equality( -; CHECK-NEXT: ret i1 true +; CHECK-NEXT: [[A:%.*]] = sdiv exact i32 10, %Z +; CHECK-NEXT: [[B:%.*]] = sdiv exact i32 20, %Z +; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[A]], [[B]] +; CHECK-NEXT: ret i1 [[C]] ; %A = sdiv exact i32 10, %Z %B = sdiv exact i32 20, %Z diff --git a/test/Transforms/InstSimplify/or.ll b/test/Transforms/InstSimplify/or.ll new file mode 100644 index 000000000000..2c5b6181bc6c --- /dev/null +++ b/test/Transforms/InstSimplify/or.ll @@ -0,0 +1,181 @@ +; RUN: opt < %s -instsimplify -S | FileCheck %s + +define i32 @test1(i32 %A) { +; CHECK-LABEL: @test1( +; CHECK-NEXT: ret i32 %A +; + %B = or i32 %A, 0 + ret i32 %B +} + +define i32 @test2(i32 %A) { +; CHECK-LABEL: @test2( +; CHECK-NEXT: ret i32 -1 +; + %B = or i32 %A, -1 + ret i32 %B +} + +define i8 @test2a(i8 %A) { +; CHECK-LABEL: @test2a( +; CHECK-NEXT: ret i8 
-1 +; + %B = or i8 %A, -1 + ret i8 %B +} + +define i1 @test3(i1 %A) { +; CHECK-LABEL: @test3( +; CHECK-NEXT: ret i1 %A +; + %B = or i1 %A, false + ret i1 %B +} + +define i1 @test4(i1 %A) { +; CHECK-LABEL: @test4( +; CHECK-NEXT: ret i1 true +; + %B = or i1 %A, true + ret i1 %B +} + +define i1 @test5(i1 %A) { +; CHECK-LABEL: @test5( +; CHECK-NEXT: ret i1 %A +; + %B = or i1 %A, %A + ret i1 %B +} + +define i32 @test6(i32 %A) { +; CHECK-LABEL: @test6( +; CHECK-NEXT: ret i32 %A +; + %B = or i32 %A, %A + ret i32 %B +} + +; A | ~A == -1 +define i32 @test7(i32 %A) { +; CHECK-LABEL: @test7( +; CHECK-NEXT: ret i32 -1 +; + %NotA = xor i32 %A, -1 + %B = or i32 %A, %NotA + ret i32 %B +} + +define i8 @test8(i8 %A) { +; CHECK-LABEL: @test8( +; CHECK-NEXT: ret i8 -1 +; + %B = or i8 %A, -2 + %C = or i8 %B, 1 + ret i8 %C +} + +; Test that (A|c1)|(B|c2) == (A|B)|(c1|c2) +define i8 @test9(i8 %A, i8 %B) { +; CHECK-LABEL: @test9( +; CHECK-NEXT: ret i8 -1 +; + %C = or i8 %A, 1 + %D = or i8 %B, -2 + %E = or i8 %C, %D + ret i8 %E +} + +define i8 @test10(i8 %A) { +; CHECK-LABEL: @test10( +; CHECK-NEXT: ret i8 -2 +; + %B = or i8 %A, 1 + %C = and i8 %B, -2 + ; (X & C1) | C2 --> (X | C2) & (C1|C2) + %D = or i8 %C, -2 + ret i8 %D +} + +define i8 @test11(i8 %A) { +; CHECK-LABEL: @test11( +; CHECK-NEXT: ret i8 -1 +; + %B = or i8 %A, -2 + %C = xor i8 %B, 13 + ; (X ^ C1) | C2 --> (X | C2) ^ (C1&~C2) + %D = or i8 %C, 1 + %E = xor i8 %D, 12 + ret i8 %E +} + +; Test the case where integer BitWidth <= 64 && BitWidth % 2 != 0. +define i39 @test1_apint(i39 %V, i39 %M) { +; CHECK-LABEL: @test1_apint( +; CHECK: [[N:%.*]] = and i39 %M, -274877906944 +; CHECK-NEXT: [[A:%.*]] = add i39 %V, [[N]] +; CHECK-NEXT: ret i39 [[A]] +; + ;; If we have: ((V + N) & C1) | (V & C2) + ;; .. and C2 = ~C1 and C2 is 0+1+ and (N & C2) == 0 + ;; replace with V+N. + %C1 = xor i39 274877906943, -1 ;; C2 = 274877906943 + %N = and i39 %M, 274877906944 + %A = add i39 %V, %N + %B = and i39 %A, %C1 + %D = and i39 %V, 274877906943 + %R = or i39 %B, %D + ret i39 %R +} + +define i7 @test2_apint(i7 %X) { +; CHECK-LABEL: @test2_apint( +; CHECK: ret i7 %X +; + %Y = or i7 %X, 0 + ret i7 %Y +} + +define i17 @test3_apint(i17 %X) { +; CHECK-LABEL: @test3_apint( +; CHECK: ret i17 -1 +; + %Y = or i17 %X, -1 + ret i17 %Y +} + +; Test the case where Integer BitWidth > 64 && BitWidth <= 1024. +define i399 @test4_apint(i399 %V, i399 %M) { +; CHECK-LABEL: @test4_apint( +; CHECK: [[N:%.*]] = and i399 %M, 18446742974197923840 +; CHECK-NEXT: [[A:%.*]] = add i399 %V, [[N]] +; CHECK-NEXT: ret i399 [[A]] +; + ;; If we have: ((V + N) & C1) | (V & C2) + ;; .. and C2 = ~C1 and C2 is 0+1+ and (N & C2) == 0 + ;; replace with V+N. 
+ %C1 = xor i399 274877906943, -1 ;; C2 = 274877906943 + %N = and i399 %M, 18446742974197923840 + %A = add i399 %V, %N + %B = and i399 %A, %C1 + %D = and i399 %V, 274877906943 + %R = or i399 %B, %D + ret i399 %R +} + +define i777 @test5_apint(i777 %X) { +; CHECK-LABEL: @test5_apint( +; CHECK: ret i777 %X +; + %Y = or i777 %X, 0 + ret i777 %Y +} + +define i117 @test6_apint(i117 %X) { +; CHECK-LABEL: @test6_apint( +; CHECK: ret i117 -1 +; + %Y = or i117 %X, -1 + ret i117 %Y +} + diff --git a/test/Transforms/LoopIdiom/ARM/ctlz.ll b/test/Transforms/LoopIdiom/ARM/ctlz.ll new file mode 100644 index 000000000000..281d97c8c338 --- /dev/null +++ b/test/Transforms/LoopIdiom/ARM/ctlz.ll @@ -0,0 +1,185 @@ +; RUN: opt -loop-idiom -mtriple=armv7a < %s -S | FileCheck -check-prefix=LZCNT --check-prefix=ALL %s +; RUN: opt -loop-idiom -mtriple=armv4t < %s -S | FileCheck -check-prefix=NOLZCNT --check-prefix=ALL %s + +; Recognize CTLZ builtin pattern. +; Here we'll just convert loop to countable, +; so do not insert builtin if CPU do not support CTLZ +; +; int ctlz_and_other(int n, char *a) +; { +; int i = 0, n0 = n; +; while(n >>= 1) { +; a[i] = (n0 & (1 << i)) ? 1 : 0; +; i++; +; } +; return i; +; } +; +; LZCNT: entry +; LZCNT: %0 = call i32 @llvm.ctlz.i32(i32 %shr8, i1 true) +; LZCNT-NEXT: %1 = sub i32 32, %0 +; LZCNT-NEXT: %2 = zext i32 %1 to i64 +; LZCNT: %indvars.iv.next.lcssa = phi i64 [ %2, %while.body ] +; LZCNT: %4 = trunc i64 %indvars.iv.next.lcssa to i32 +; LZCNT: %i.0.lcssa = phi i32 [ 0, %entry ], [ %4, %while.end.loopexit ] +; LZCNT: ret i32 %i.0.lcssa + +; NOLZCNT: entry +; NOLZCNT-NOT: @llvm.ctlz + +; Function Attrs: norecurse nounwind uwtable +define i32 @ctlz_and_other(i32 %n, i8* nocapture %a) { +entry: + %shr8 = ashr i32 %n, 1 + %tobool9 = icmp eq i32 %shr8, 0 + br i1 %tobool9, label %while.end, label %while.body.preheader + +while.body.preheader: ; preds = %entry + br label %while.body + +while.body: ; preds = %while.body.preheader, %while.body + %indvars.iv = phi i64 [ %indvars.iv.next, %while.body ], [ 0, %while.body.preheader ] + %shr11 = phi i32 [ %shr, %while.body ], [ %shr8, %while.body.preheader ] + %0 = trunc i64 %indvars.iv to i32 + %shl = shl i32 1, %0 + %and = and i32 %shl, %n + %tobool1 = icmp ne i32 %and, 0 + %conv = zext i1 %tobool1 to i8 + %arrayidx = getelementptr inbounds i8, i8* %a, i64 %indvars.iv + store i8 %conv, i8* %arrayidx, align 1 + %indvars.iv.next = add nuw i64 %indvars.iv, 1 + %shr = ashr i32 %shr11, 1 + %tobool = icmp eq i32 %shr, 0 + br i1 %tobool, label %while.end.loopexit, label %while.body + +while.end.loopexit: ; preds = %while.body + %1 = trunc i64 %indvars.iv.next to i32 + br label %while.end + +while.end: ; preds = %while.end.loopexit, %entry + %i.0.lcssa = phi i32 [ 0, %entry ], [ %1, %while.end.loopexit ] + ret i32 %i.0.lcssa +} + +; Recognize CTLZ builtin pattern. +; Here it will replace the loop - +; assume builtin is always profitable. 
+; +; int ctlz_zero_check(int n) +; { +; int i = 0; +; while(n) { +; n >>= 1; +; i++; +; } +; return i; +; } +; +; ALL: entry +; ALL: %0 = call i32 @llvm.ctlz.i32(i32 %n, i1 true) +; ALL-NEXT: %1 = sub i32 32, %0 +; ALL: %inc.lcssa = phi i32 [ %1, %while.body ] +; ALL: %i.0.lcssa = phi i32 [ 0, %entry ], [ %inc.lcssa, %while.end.loopexit ] +; ALL: ret i32 %i.0.lcssa + +; Function Attrs: norecurse nounwind readnone uwtable +define i32 @ctlz_zero_check(i32 %n) { +entry: + %tobool4 = icmp eq i32 %n, 0 + br i1 %tobool4, label %while.end, label %while.body.preheader + +while.body.preheader: ; preds = %entry + br label %while.body + +while.body: ; preds = %while.body.preheader, %while.body + %i.06 = phi i32 [ %inc, %while.body ], [ 0, %while.body.preheader ] + %n.addr.05 = phi i32 [ %shr, %while.body ], [ %n, %while.body.preheader ] + %shr = ashr i32 %n.addr.05, 1 + %inc = add nsw i32 %i.06, 1 + %tobool = icmp eq i32 %shr, 0 + br i1 %tobool, label %while.end.loopexit, label %while.body + +while.end.loopexit: ; preds = %while.body + br label %while.end + +while.end: ; preds = %while.end.loopexit, %entry + %i.0.lcssa = phi i32 [ 0, %entry ], [ %inc, %while.end.loopexit ] + ret i32 %i.0.lcssa +} + +; Recognize CTLZ builtin pattern. +; Here it will replace the loop - +; assume builtin is always profitable. +; +; int ctlz(int n) +; { +; int i = 0; +; while(n >>= 1) { +; i++; +; } +; return i; +; } +; +; ALL: entry +; ALL: %0 = ashr i32 %n, 1 +; ALL-NEXT: %1 = call i32 @llvm.ctlz.i32(i32 %0, i1 false) +; ALL-NEXT: %2 = sub i32 32, %1 +; ALL-NEXT: %3 = add i32 %2, 1 +; ALL: %i.0.lcssa = phi i32 [ %2, %while.cond ] +; ALL: ret i32 %i.0.lcssa + +; Function Attrs: norecurse nounwind readnone uwtable +define i32 @ctlz(i32 %n) { +entry: + br label %while.cond + +while.cond: ; preds = %while.cond, %entry + %n.addr.0 = phi i32 [ %n, %entry ], [ %shr, %while.cond ] + %i.0 = phi i32 [ 0, %entry ], [ %inc, %while.cond ] + %shr = ashr i32 %n.addr.0, 1 + %tobool = icmp eq i32 %shr, 0 + %inc = add nsw i32 %i.0, 1 + br i1 %tobool, label %while.end, label %while.cond + +while.end: ; preds = %while.cond + ret i32 %i.0 +} + +; Recognize CTLZ builtin pattern. +; Here it will replace the loop - +; assume builtin is always profitable. 
+; +; int ctlz_add(int n, int i0) +; { +; int i = i0; +; while(n >>= 1) { +; i++; +; } +; return i; +; } +; +; ALL: entry +; ALL: %0 = ashr i32 %n, 1 +; ALL-NEXT: %1 = call i32 @llvm.ctlz.i32(i32 %0, i1 false) +; ALL-NEXT: %2 = sub i32 32, %1 +; ALL-NEXT: %3 = add i32 %2, 1 +; ALL-NEXT: %4 = add i32 %2, %i0 +; ALL: %i.0.lcssa = phi i32 [ %4, %while.cond ] +; ALL: ret i32 %i.0.lcssa +; +; Function Attrs: norecurse nounwind readnone uwtable +define i32 @ctlz_add(i32 %n, i32 %i0) { +entry: + br label %while.cond + +while.cond: ; preds = %while.cond, %entry + %n.addr.0 = phi i32 [ %n, %entry ], [ %shr, %while.cond ] + %i.0 = phi i32 [ %i0, %entry ], [ %inc, %while.cond ] + %shr = ashr i32 %n.addr.0, 1 + %tobool = icmp eq i32 %shr, 0 + %inc = add nsw i32 %i.0, 1 + br i1 %tobool, label %while.end, label %while.cond + +while.end: ; preds = %while.cond + ret i32 %i.0 +} diff --git a/test/Transforms/LoopIdiom/X86/ctlz.ll b/test/Transforms/LoopIdiom/X86/ctlz.ll new file mode 100644 index 000000000000..d8daa3a9bbab --- /dev/null +++ b/test/Transforms/LoopIdiom/X86/ctlz.ll @@ -0,0 +1,185 @@ +; RUN: opt -loop-idiom -mtriple=x86_64 -mcpu=core-avx2 < %s -S | FileCheck -check-prefix=LZCNT --check-prefix=ALL %s +; RUN: opt -loop-idiom -mtriple=x86_64 -mcpu=corei7 < %s -S | FileCheck -check-prefix=NOLZCNT --check-prefix=ALL %s + +; Recognize CTLZ builtin pattern. +; Here we'll just convert loop to countable, +; so do not insert builtin if CPU do not support CTLZ +; +; int ctlz_and_other(int n, char *a) +; { +; int i = 0, n0 = n; +; while(n >>= 1) { +; a[i] = (n0 & (1 << i)) ? 1 : 0; +; i++; +; } +; return i; +; } +; +; LZCNT: entry +; LZCNT: %0 = call i32 @llvm.ctlz.i32(i32 %shr8, i1 true) +; LZCNT-NEXT: %1 = sub i32 32, %0 +; LZCNT-NEXT: %2 = zext i32 %1 to i64 +; LZCNT: %indvars.iv.next.lcssa = phi i64 [ %2, %while.body ] +; LZCNT: %4 = trunc i64 %indvars.iv.next.lcssa to i32 +; LZCNT: %i.0.lcssa = phi i32 [ 0, %entry ], [ %4, %while.end.loopexit ] +; LZCNT: ret i32 %i.0.lcssa + +; NOLZCNT: entry +; NOLZCNT-NOT: @llvm.ctlz + +; Function Attrs: norecurse nounwind uwtable +define i32 @ctlz_and_other(i32 %n, i8* nocapture %a) { +entry: + %shr8 = ashr i32 %n, 1 + %tobool9 = icmp eq i32 %shr8, 0 + br i1 %tobool9, label %while.end, label %while.body.preheader + +while.body.preheader: ; preds = %entry + br label %while.body + +while.body: ; preds = %while.body.preheader, %while.body + %indvars.iv = phi i64 [ %indvars.iv.next, %while.body ], [ 0, %while.body.preheader ] + %shr11 = phi i32 [ %shr, %while.body ], [ %shr8, %while.body.preheader ] + %0 = trunc i64 %indvars.iv to i32 + %shl = shl i32 1, %0 + %and = and i32 %shl, %n + %tobool1 = icmp ne i32 %and, 0 + %conv = zext i1 %tobool1 to i8 + %arrayidx = getelementptr inbounds i8, i8* %a, i64 %indvars.iv + store i8 %conv, i8* %arrayidx, align 1 + %indvars.iv.next = add nuw i64 %indvars.iv, 1 + %shr = ashr i32 %shr11, 1 + %tobool = icmp eq i32 %shr, 0 + br i1 %tobool, label %while.end.loopexit, label %while.body + +while.end.loopexit: ; preds = %while.body + %1 = trunc i64 %indvars.iv.next to i32 + br label %while.end + +while.end: ; preds = %while.end.loopexit, %entry + %i.0.lcssa = phi i32 [ 0, %entry ], [ %1, %while.end.loopexit ] + ret i32 %i.0.lcssa +} + +; Recognize CTLZ builtin pattern. +; Here it will replace the loop - +; assume builtin is always profitable. 
+; +; int ctlz_zero_check(int n) +; { +; int i = 0; +; while(n) { +; n >>= 1; +; i++; +; } +; return i; +; } +; +; ALL: entry +; ALL: %0 = call i32 @llvm.ctlz.i32(i32 %n, i1 true) +; ALL-NEXT: %1 = sub i32 32, %0 +; ALL: %inc.lcssa = phi i32 [ %1, %while.body ] +; ALL: %i.0.lcssa = phi i32 [ 0, %entry ], [ %inc.lcssa, %while.end.loopexit ] +; ALL: ret i32 %i.0.lcssa + +; Function Attrs: norecurse nounwind readnone uwtable +define i32 @ctlz_zero_check(i32 %n) { +entry: + %tobool4 = icmp eq i32 %n, 0 + br i1 %tobool4, label %while.end, label %while.body.preheader + +while.body.preheader: ; preds = %entry + br label %while.body + +while.body: ; preds = %while.body.preheader, %while.body + %i.06 = phi i32 [ %inc, %while.body ], [ 0, %while.body.preheader ] + %n.addr.05 = phi i32 [ %shr, %while.body ], [ %n, %while.body.preheader ] + %shr = ashr i32 %n.addr.05, 1 + %inc = add nsw i32 %i.06, 1 + %tobool = icmp eq i32 %shr, 0 + br i1 %tobool, label %while.end.loopexit, label %while.body + +while.end.loopexit: ; preds = %while.body + br label %while.end + +while.end: ; preds = %while.end.loopexit, %entry + %i.0.lcssa = phi i32 [ 0, %entry ], [ %inc, %while.end.loopexit ] + ret i32 %i.0.lcssa +} + +; Recognize CTLZ builtin pattern. +; Here it will replace the loop - +; assume builtin is always profitable. +; +; int ctlz(int n) +; { +; int i = 0; +; while(n >>= 1) { +; i++; +; } +; return i; +; } +; +; ALL: entry +; ALL: %0 = ashr i32 %n, 1 +; ALL-NEXT: %1 = call i32 @llvm.ctlz.i32(i32 %0, i1 false) +; ALL-NEXT: %2 = sub i32 32, %1 +; ALL-NEXT: %3 = add i32 %2, 1 +; ALL: %i.0.lcssa = phi i32 [ %2, %while.cond ] +; ALL: ret i32 %i.0.lcssa + +; Function Attrs: norecurse nounwind readnone uwtable +define i32 @ctlz(i32 %n) { +entry: + br label %while.cond + +while.cond: ; preds = %while.cond, %entry + %n.addr.0 = phi i32 [ %n, %entry ], [ %shr, %while.cond ] + %i.0 = phi i32 [ 0, %entry ], [ %inc, %while.cond ] + %shr = ashr i32 %n.addr.0, 1 + %tobool = icmp eq i32 %shr, 0 + %inc = add nsw i32 %i.0, 1 + br i1 %tobool, label %while.end, label %while.cond + +while.end: ; preds = %while.cond + ret i32 %i.0 +} + +; Recognize CTLZ builtin pattern. +; Here it will replace the loop - +; assume builtin is always profitable. +; +; int ctlz_add(int n, int i0) +; { +; int i = i0; +; while(n >>= 1) { +; i++; +; } +; return i; +; } +; +; ALL: entry +; ALL: %0 = ashr i32 %n, 1 +; ALL-NEXT: %1 = call i32 @llvm.ctlz.i32(i32 %0, i1 false) +; ALL-NEXT: %2 = sub i32 32, %1 +; ALL-NEXT: %3 = add i32 %2, 1 +; ALL-NEXT: %4 = add i32 %2, %i0 +; ALL: %i.0.lcssa = phi i32 [ %4, %while.cond ] +; ALL: ret i32 %i.0.lcssa +; +; Function Attrs: norecurse nounwind readnone uwtable +define i32 @ctlz_add(i32 %n, i32 %i0) { +entry: + br label %while.cond + +while.cond: ; preds = %while.cond, %entry + %n.addr.0 = phi i32 [ %n, %entry ], [ %shr, %while.cond ] + %i.0 = phi i32 [ %i0, %entry ], [ %inc, %while.cond ] + %shr = ashr i32 %n.addr.0, 1 + %tobool = icmp eq i32 %shr, 0 + %inc = add nsw i32 %i.0, 1 + br i1 %tobool, label %while.end, label %while.cond + +while.end: ; preds = %while.cond + ret i32 %i.0 +} diff --git a/test/Transforms/LoopUnroll/not-rotated.ll b/test/Transforms/LoopUnroll/not-rotated.ll index ffe80920d948..b4b88e096079 100644 --- a/test/Transforms/LoopUnroll/not-rotated.ll +++ b/test/Transforms/LoopUnroll/not-rotated.ll @@ -4,7 +4,7 @@ ; properly handled by LoopUnroll, currently. 
; RUN: opt -loop-unroll -verify-dom-info %s -; REQUIRE: asserts +; REQUIRES: asserts define void @tinkywinky(i1 %patatino) { entry: diff --git a/test/Transforms/LoopVectorize/X86/svml-calls-finite.ll b/test/Transforms/LoopVectorize/X86/svml-calls-finite.ll new file mode 100644 index 000000000000..5a4bfe5e6bdd --- /dev/null +++ b/test/Transforms/LoopVectorize/X86/svml-calls-finite.ll @@ -0,0 +1,187 @@ +; RUN: opt -vector-library=SVML -loop-vectorize -S < %s | FileCheck %s + +; Test to verify that when math headers are built with +; __FINITE_MATH_ONLY__ enabled, causing use of __<func>_finite +; function versions, vectorization can map these to vector versions. + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +declare float @__expf_finite(float) #0 + +; CHECK-LABEL: @exp_f32 +; CHECK: <4 x float> @__svml_expf4 +; CHECK: ret +define void @exp_f32(float* nocapture %varray) { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %tmp = trunc i64 %indvars.iv to i32 + %conv = sitofp i32 %tmp to float + %call = tail call fast float @__expf_finite(float %conv) + %arrayidx = getelementptr inbounds float, float* %varray, i64 %indvars.iv + store float %call, float* %arrayidx, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !1 + +for.end: ; preds = %for.body + ret void +} + +!1 = distinct !{!1, !2, !3} +!2 = !{!"llvm.loop.vectorize.width", i32 4} +!3 = !{!"llvm.loop.vectorize.enable", i1 true} + + +declare double @__exp_finite(double) #0 + +; CHECK-LABEL: @exp_f64 +; CHECK: <4 x double> @__svml_exp4 +; CHECK: ret +define void @exp_f64(double* nocapture %varray) { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %tmp = trunc i64 %indvars.iv to i32 + %conv = sitofp i32 %tmp to double + %call = tail call fast double @__exp_finite(double %conv) + %arrayidx = getelementptr inbounds double, double* %varray, i64 %indvars.iv + store double %call, double* %arrayidx, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !11 + +for.end: ; preds = %for.body + ret void +} + +!11 = distinct !{!11, !12, !13} +!12 = !{!"llvm.loop.vectorize.width", i32 4} +!13 = !{!"llvm.loop.vectorize.enable", i1 true} + + + + +declare float @__logf_finite(float) #0 + +; CHECK-LABEL: @log_f32 +; CHECK: <4 x float> @__svml_logf4 +; CHECK: ret +define void @log_f32(float* nocapture %varray) { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %tmp = trunc i64 %indvars.iv to i32 + %conv = sitofp i32 %tmp to float + %call = tail call fast float @__logf_finite(float %conv) + %arrayidx = getelementptr inbounds float, float* %varray, i64 %indvars.iv + store float %call, float* %arrayidx, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !21 + +for.end: ; preds = %for.body + ret void +} + +!21 = distinct !{!21, !22, !23} +!22 = !{!"llvm.loop.vectorize.width", i32 4} +!23 = !{!"llvm.loop.vectorize.enable", i1 true} + + +declare double @__log_finite(double) 
#0 + +; CHECK-LABEL: @log_f64 +; CHECK: <4 x double> @__svml_log4 +; CHECK: ret +define void @log_f64(double* nocapture %varray) { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %tmp = trunc i64 %indvars.iv to i32 + %conv = sitofp i32 %tmp to double + %call = tail call fast double @__log_finite(double %conv) + %arrayidx = getelementptr inbounds double, double* %varray, i64 %indvars.iv + store double %call, double* %arrayidx, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !31 + +for.end: ; preds = %for.body + ret void +} + +!31 = distinct !{!31, !32, !33} +!32 = !{!"llvm.loop.vectorize.width", i32 4} +!33 = !{!"llvm.loop.vectorize.enable", i1 true} + + +declare float @__powf_finite(float, float) #0 + +; CHECK-LABEL: @pow_f32 +; CHECK: <4 x float> @__svml_powf4 +; CHECK: ret +define void @pow_f32(float* nocapture %varray, float* nocapture readonly %exp) { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %tmp = trunc i64 %indvars.iv to i32 + %conv = sitofp i32 %tmp to float + %arrayidx = getelementptr inbounds float, float* %exp, i64 %indvars.iv + %tmp1 = load float, float* %arrayidx, align 4 + %tmp2 = tail call fast float @__powf_finite(float %conv, float %tmp1) + %arrayidx2 = getelementptr inbounds float, float* %varray, i64 %indvars.iv + store float %tmp2, float* %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !41 + +for.end: ; preds = %for.body + ret void +} + +!41 = distinct !{!41, !42, !43} +!42 = !{!"llvm.loop.vectorize.width", i32 4} +!43 = !{!"llvm.loop.vectorize.enable", i1 true} + + +declare double @__pow_finite(double, double) #0 + +; CHECK-LABEL: @pow_f64 +; CHECK: <4 x double> @__svml_pow4 +; CHECK: ret +define void @pow_f64(double* nocapture %varray, double* nocapture readonly %exp) { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %tmp = trunc i64 %indvars.iv to i32 + %conv = sitofp i32 %tmp to double + %arrayidx = getelementptr inbounds double, double* %exp, i64 %indvars.iv + %tmp1 = load double, double* %arrayidx, align 4 + %tmp2 = tail call fast double @__pow_finite(double %conv, double %tmp1) + %arrayidx2 = getelementptr inbounds double, double* %varray, i64 %indvars.iv + store double %tmp2, double* %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !51 + +for.end: ; preds = %for.body + ret void +} + +!51 = distinct !{!51, !52, !53} +!52 = !{!"llvm.loop.vectorize.width", i32 4} +!53 = !{!"llvm.loop.vectorize.enable", i1 true} diff --git a/test/Transforms/LoopVectorize/induction.ll b/test/Transforms/LoopVectorize/induction.ll index 6507166dd1f2..7e9e6b1cdc8e 100644 --- a/test/Transforms/LoopVectorize/induction.ll +++ b/test/Transforms/LoopVectorize/induction.ll @@ -849,3 +849,48 @@ for.end: %tmp7 = phi i32 [ %tmp6, %for.inc ] ret i32 %tmp7 } + +; Ensure that the shuffle vector for first order recurrence is inserted +; correctly after all the phis. 
These new phis correspond to new IVs +; that are generated by optimizing non-free truncs of IVs to IVs themselves. +define i64 @trunc_with_first_order_recurrence() { +; CHECK-LABEL: trunc_with_first_order_recurrence +; CHECK-LABEL: vector.body: +; CHECK-NEXT: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] +; CHECK-NEXT: %vec.phi = phi <2 x i64> +; CHECK-NEXT: %vec.ind = phi <2 x i64> [ <i64 1, i64 2>, %vector.ph ], [ %vec.ind.next, %vector.body ] +; CHECK-NEXT: %vec.ind2 = phi <2 x i32> [ <i32 1, i32 2>, %vector.ph ], [ %vec.ind.next3, %vector.body ] +; CHECK-NEXT: %vector.recur = phi <2 x i32> [ <i32 undef, i32 42>, %vector.ph ], [ %vec.ind5, %vector.body ] +; CHECK-NEXT: %vec.ind5 = phi <2 x i32> [ <i32 1, i32 2>, %vector.ph ], [ %vec.ind.next6, %vector.body ] +; CHECK-NEXT: %vec.ind7 = phi <2 x i32> [ <i32 1, i32 2>, %vector.ph ], [ %vec.ind.next8, %vector.body ] +; CHECK-NEXT: shufflevector <2 x i32> %vector.recur, <2 x i32> %vec.ind5, <2 x i32> <i32 1, i32 2> +entry: + br label %loop + +exit: ; preds = %loop + %.lcssa = phi i64 [ %c23, %loop ] + ret i64 %.lcssa + +loop: ; preds = %loop, %entry + %c5 = phi i64 [ %c23, %loop ], [ 0, %entry ] + %indvars.iv = phi i64 [ %indvars.iv.next, %loop ], [ 1, %entry ] + %x = phi i32 [ %c24, %loop ], [ 1, %entry ] + %y = phi i32 [ %c6, %loop ], [ 42, %entry ] + %c6 = trunc i64 %indvars.iv to i32 + %c8 = mul i32 %x, %c6 + %c9 = add i32 %c8, 42 + %c10 = add i32 %y, %c6 + %c11 = add i32 %c10, %c9 + %c12 = sext i32 %c11 to i64 + %c13 = add i64 %c5, %c12 + %indvars.iv.tr = trunc i64 %indvars.iv to i32 + %c14 = shl i32 %indvars.iv.tr, 1 + %c15 = add i32 %c9, %c14 + %c16 = sext i32 %c15 to i64 + %c23 = add i64 %c13, %c16 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %c24 = add nuw nsw i32 %x, 1 + %exitcond.i = icmp eq i64 %indvars.iv.next, 114 + br i1 %exitcond.i, label %exit, label %loop + +} diff --git a/test/Transforms/LoopVectorize/pr32859.ll b/test/Transforms/LoopVectorize/pr32859.ll new file mode 100644 index 000000000000..31cb84699f71 --- /dev/null +++ b/test/Transforms/LoopVectorize/pr32859.ll @@ -0,0 +1,30 @@ +; RUN: opt < %s -loop-vectorize -S | FileCheck %s + +; Out of the LCSSA form we could have 'phi i32 [ loop-invariant, %for.inc.2.i ]' +; but the IR Verifier requires a PHI to have one entry for each predecessor of +; its parent basic block. The original PR14725 solution for the issue just +; added 'undef' for a predecessor BB, which is not correct. We copy the real +; value for the other predecessor instead of using 'undef'. 
+ +; CHECK-LABEL: for.cond.preheader: +; CHECK: %e.0.ph = phi i32 [ 0, %if.end.2.i ], [ 0, %middle.block ] + +; Function Attrs: nounwind uwtable +define void @main() #0 { +entry: + br label %for.cond1.preheader.i + +for.cond1.preheader.i: ; preds = %if.end.2.i, %entry + %c.06.i = phi i32 [ 0, %entry ], [ %inc5.i, %if.end.2.i ] + %tobool.i = icmp ne i32 undef, 0 + br label %if.end.2.i + +if.end.2.i: ; preds = %for.cond1.preheader.i + %inc5.i = add nsw i32 %c.06.i, 1 + %cmp.i = icmp slt i32 %inc5.i, 16 + br i1 %cmp.i, label %for.cond1.preheader.i, label %for.cond.preheader + +for.cond.preheader: ; preds = %if.end.2.i + %e.0.ph = phi i32 [ 0, %if.end.2.i ] + unreachable +} diff --git a/test/Transforms/NewGVN/pr32934.ll b/test/Transforms/NewGVN/pr32934.ll new file mode 100644 index 000000000000..4bb7ea150437 --- /dev/null +++ b/test/Transforms/NewGVN/pr32934.ll @@ -0,0 +1,69 @@ +; REQUIRES: disabled +; RUN: opt -S -newgvn %s | FileCheck %s + +; CHECK: define void @tinkywinky() { +; CHECK-NEXT: entry: +; CHECK-NEXT: %d = alloca i32, align 4 +; CHECK-NEXT: store i32 0, i32* null, align 4 +; CHECK-NEXT: br label %for.cond +; CHECK: for.cond: ; preds = %if.end, %entry +; CHECK-NEXT: %0 = load i32, i32* null, align 4 +; CHECK-NEXT: %cmp = icmp slt i32 %0, 1 +; CHECK-NEXT: br i1 %cmp, label %for.body, label %while.cond +; CHECK: for.body: ; preds = %for.cond +; CHECK-NEXT: %1 = load i32, i32* @a, align 4 +; CHECK-NEXT: store i32 %1, i32* %d, align 4 +; CHECK-NEXT: br label %L +; CHECK: L: ; preds = %if.then, %for.body +; CHECK-NEXT: %tobool = icmp ne i32 %1, 0 +; CHECK-NEXT: br i1 %tobool, label %if.then, label %if.end +; CHECK: if.then: ; preds = %L +; CHECK-NEXT: call void (i8*, ...) @printf(i8* getelementptr inbounds ([2 x i8], [2 x i8]* @patatino, i32 0, i32 0)) +; CHECK-NEXT: br label %L +; CHECK: if.end: ; preds = %L +; CHECK-NEXT: br label %for.cond +; CHECK: while.cond: ; preds = %while.body, %for.cond +; CHECK-NEXT: br i1 undef, label %while.body, label %while.end +; CHECK: while.body: ; preds = %while.cond +; CHECK-NEXT: call void (i8*, ...) @printf(i8* getelementptr inbounds ([2 x i8], [2 x i8]* @patatino, i32 0, i32 0)) +; CHECK-NEXT: br label %while.cond +; CHECK: while.end: +; CHECK-NEXT: %2 = load i32, i32* @a, align 4 +; CHECK-NEXT: store i32 %2, i32* undef, align 4 +; CHECK-NEXT: ret void + +@a = external global i32, align 4 +@patatino = external unnamed_addr constant [2 x i8], align 1 +define void @tinkywinky() { +entry: + %d = alloca i32, align 4 + store i32 0, i32* null, align 4 + br label %for.cond +for.cond: + %0 = load i32, i32* null, align 4 + %cmp = icmp slt i32 %0, 1 + br i1 %cmp, label %for.body, label %while.cond +for.body: + %1 = load i32, i32* @a, align 4 + store i32 %1, i32* %d, align 4 + br label %L +L: + %2 = load i32, i32* %d, align 4 + %tobool = icmp ne i32 %2, 0 + br i1 %tobool, label %if.then, label %if.end +if.then: + call void (i8*, ...) @printf(i8* getelementptr inbounds ([2 x i8], [2 x i8]* @patatino, i32 0, i32 0)) + br label %L +if.end: + br label %for.cond +while.cond: + br i1 undef, label %while.body, label %while.end +while.body: + call void (i8*, ...) @printf(i8* getelementptr inbounds ([2 x i8], [2 x i8]* @patatino, i32 0, i32 0)) + br label %while.cond +while.end: + %3 = load i32, i32* @a, align 4 + store i32 %3, i32* undef, align 4 + ret void +} +declare void @printf(i8*, ...) 
#1 diff --git a/test/Transforms/NewGVN/pr32952.ll b/test/Transforms/NewGVN/pr32952.ll new file mode 100644 index 000000000000..056b3a5105ec --- /dev/null +++ b/test/Transforms/NewGVN/pr32952.ll @@ -0,0 +1,42 @@ +; PR32952: Don't erroneously consider congruent two phi nodes which +; have the same arguments but different incoming edges. +; RUN: opt -newgvn -S %s | FileCheck %s + +@a = common global i16 0, align 2 +@.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1 + +define i32 @tinkywinky() { +entry: + %0 = load i16, i16* @a, align 2 + %conv = sext i16 %0 to i32 + %neg = xor i32 %conv, -1 + %conv1 = trunc i32 %neg to i16 + %conv3 = zext i16 %conv1 to i32 + %cmp = icmp slt i32 %conv, %conv3 + br i1 %cmp, label %tinky, label %winky + +tinky: + store i16 2, i16* @a, align 2 + br label %patatino + +winky: + br label %patatino + +patatino: +; CHECK: %meh = phi i16 [ %0, %winky ], [ %conv1, %tinky ] +; CHECK: %banana = phi i16 [ %0, %tinky ], [ %conv1, %winky ] + %meh = phi i16 [ %0, %winky ], [ %conv1, %tinky ] + %banana = phi i16 [ %0, %tinky ], [ %conv1, %winky ] + br label %end + +end: +; CHECK: %promoted = zext i16 %banana to i32 +; CHECK: %other = zext i16 %meh to i32 + %promoted = zext i16 %banana to i32 + %other = zext i16 %meh to i32 + %first = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32 %promoted) + %second = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32 %other) + ret i32 0 +} + +declare i32 @printf(i8*, ...) diff --git a/test/Transforms/NewGVN/verify-memoryphi.ll b/test/Transforms/NewGVN/verify-memoryphi.ll new file mode 100644 index 000000000000..57dbd18986d2 --- /dev/null +++ b/test/Transforms/NewGVN/verify-memoryphi.ll @@ -0,0 +1,29 @@ +; Skip dead MemoryPhis when performing memory congruency verification +; in NewGVN. +; RUN: opt -S -newgvn %s | FileCheck %s +; REQUIRES: asserts + +; CHECK: define void @tinkywinky() { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label %body, label %end +; CHECK: body: +; CHECK-NEXT: store i8 undef, i8* null +; CHECK-NEXT: br label %end +; CHECK: end: +; CHECK-NEXT: ret void +; CHECK-NEXT: } + +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) + +define void @tinkywinky() { +entry: + call void @llvm.lifetime.start.p0i8(i64 4, i8* undef) + br i1 false, label %body, label %end + +body: + call void @llvm.lifetime.start.p0i8(i64 4, i8* undef) + br label %end + +end: + ret void +} diff --git a/test/Transforms/SLPVectorizer/AArch64/64-bit-vector.ll b/test/Transforms/SLPVectorizer/AArch64/64-bit-vector.ll new file mode 100644 index 000000000000..edc8042a217d --- /dev/null +++ b/test/Transforms/SLPVectorizer/AArch64/64-bit-vector.ll @@ -0,0 +1,22 @@ +; RUN: opt -S -slp-vectorizer -mtriple=aarch64--linux-gnu -mcpu=generic < %s | FileCheck %s +; RUN: opt -S -slp-vectorizer -mtriple=aarch64-apple-ios -mcpu=cyclone < %s | FileCheck %s +; Currently disabled for a few subtargets (e.g. 
Kryo): +; RUN: opt -S -slp-vectorizer -mtriple=aarch64--linux-gnu -mcpu=kryo < %s | FileCheck --check-prefix=NO_SLP %s +; RUN: opt -S -slp-vectorizer -mtriple=aarch64--linux-gnu -mcpu=generic -slp-min-reg-size=128 < %s | FileCheck --check-prefix=NO_SLP %s + +define void @f(float* %r, float* %w) { + %r0 = getelementptr inbounds float, float* %r, i64 0 + %r1 = getelementptr inbounds float, float* %r, i64 1 + %f0 = load float, float* %r0 + %f1 = load float, float* %r1 + %add0 = fadd float %f0, %f0 +; CHECK: fadd <2 x float> +; NO_SLP: fadd float +; NO_SLP: fadd float + %add1 = fadd float %f1, %f1 + %w0 = getelementptr inbounds float, float* %w, i64 0 + %w1 = getelementptr inbounds float, float* %w, i64 1 + store float %add0, float* %w0 + store float %add1, float* %w1 + ret void +} diff --git a/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll b/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll index e9b71963530c..962a6c3b57b3 100644 --- a/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll +++ b/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll @@ -1,4 +1,5 @@ -; RUN: opt -S -slp-vectorizer -slp-threshold=-18 -dce -instcombine < %s | FileCheck %s +; RUN: opt -S -slp-vectorizer -slp-threshold=-18 -dce -instcombine -pass-remarks-output=%t < %s | FileCheck %s +; RUN: cat %t | FileCheck -check-prefix=YAML %s target datalayout = "e-m:e-i32:64-i128:128-n32:64-S128" target triple = "aarch64--linux-gnu" @@ -23,7 +24,25 @@ target triple = "aarch64--linux-gnu" ; CHECK: [[A:%[a-zA-Z0-9.]+]] = add nsw <4 x i32> ; CHECK: [[X:%[a-zA-Z0-9.]+]] = extractelement <4 x i32> [[A]] ; CHECK: sext i32 [[X]] to i64 -; + +; YAML: Pass: slp-vectorizer +; YAML-NEXT: Name: VectorizedList +; YAML-NEXT: Function: getelementptr_4x32 +; YAML-NEXT: Args: +; YAML-NEXT: - String: 'SLP vectorized with cost ' +; YAML-NEXT: - Cost: '11' +; YAML-NEXT: - String: ' and with tree size ' +; YAML-NEXT: - TreeSize: '5' + +; YAML: Pass: slp-vectorizer +; YAML-NEXT: Name: VectorizedList +; YAML-NEXT: Function: getelementptr_4x32 +; YAML-NEXT: Args: +; YAML-NEXT: - String: 'SLP vectorized with cost ' +; YAML-NEXT: - Cost: '16' +; YAML-NEXT: - String: ' and with tree size ' +; YAML-NEXT: - TreeSize: '3' + define i32 @getelementptr_4x32(i32* nocapture readonly %g, i32 %n, i32 %x, i32 %y, i32 %z) { entry: %cmp31 = icmp sgt i32 %n, 0 @@ -69,7 +88,25 @@ for.body: ; CHECK: [[A:%[a-zA-Z0-9.]+]] = add nsw <2 x i32> ; CHECK: [[X:%[a-zA-Z0-9.]+]] = extractelement <2 x i32> [[A]] ; CHECK: sext i32 [[X]] to i64 -; + +; YAML: Pass: slp-vectorizer +; YAML-NEXT: Name: VectorizedList +; YAML-NEXT: Function: getelementptr_2x32 +; YAML-NEXT: Args: +; YAML-NEXT: - String: 'SLP vectorized with cost ' +; YAML-NEXT: - Cost: '11' +; YAML-NEXT: - String: ' and with tree size ' +; YAML-NEXT: - TreeSize: '5' + +; YAML: Pass: slp-vectorizer +; YAML-NEXT: Name: VectorizedList +; YAML-NEXT: Function: getelementptr_2x32 +; YAML-NEXT: Args: +; YAML-NEXT: - String: 'SLP vectorized with cost ' +; YAML-NEXT: - Cost: '6' +; YAML-NEXT: - String: ' and with tree size ' +; YAML-NEXT: - TreeSize: '3' + define i32 @getelementptr_2x32(i32* nocapture readonly %g, i32 %n, i32 %x, i32 %y, i32 %z) { entry: %cmp31 = icmp sgt i32 %n, 0 diff --git a/test/Transforms/SLPVectorizer/AArch64/horizontal.ll b/test/Transforms/SLPVectorizer/AArch64/horizontal.ll index 8f8bf2648aa2..1a6a2fb890d3 100644 --- a/test/Transforms/SLPVectorizer/AArch64/horizontal.ll +++ b/test/Transforms/SLPVectorizer/AArch64/horizontal.ll @@ -1,4 +1,5 @@ -; RUN: opt -slp-vectorizer -slp-threshold=-6 
-S < %s | FileCheck %s +; RUN: opt -slp-vectorizer -slp-threshold=-6 -S -pass-remarks-output=%t < %s | FileCheck %s +; RUN: cat %t | FileCheck -check-prefix=YAML %s ; FIXME: The threshold is changed to keep this test case a bit smaller. ; The AArch64 cost model should not give such high costs to select statements. @@ -10,6 +11,16 @@ target triple = "aarch64--linux" ; CHECK: load <4 x i32> ; CHECK: load <4 x i32> ; CHECK: select <4 x i1> + +; YAML: Pass: slp-vectorizer +; YAML-NEXT: Name: VectorizedHorizontalReduction +; YAML-NEXT: Function: test_select +; YAML-NEXT: Args: +; YAML-NEXT: - String: 'Vectorized horizontal reduction with cost ' +; YAML-NEXT: - Cost: '4' +; YAML-NEXT: - String: ' and with tree size ' +; YAML-NEXT: - TreeSize: '8' + define i32 @test_select(i32* noalias nocapture readonly %blk1, i32* noalias nocapture readonly %blk2, i32 %lx, i32 %h) { entry: %cmp.22 = icmp sgt i32 %h, 0 @@ -93,6 +104,16 @@ define i32 @reduction_with_br(i32* noalias nocapture readonly %blk1, i32* noalia ; CHECK: load <4 x i32> ; CHECK: load <4 x i32> ; CHECK: mul nsw <4 x i32> + +; YAML: Pass: slp-vectorizer +; YAML-NEXT: Name: VectorizedHorizontalReduction +; YAML-NEXT: Function: reduction_with_br +; YAML-NEXT: Args: +; YAML-NEXT: - String: 'Vectorized horizontal reduction with cost ' +; YAML-NEXT: - Cost: '1' +; YAML-NEXT: - String: ' and with tree size ' +; YAML-NEXT: - TreeSize: '3' + entry: %cmp.16 = icmp sgt i32 %h, 0 br i1 %cmp.16, label %for.body.lr.ph, label %for.end @@ -150,6 +171,16 @@ for.end: ; preds = %for.end.loopexit, % ; CHECK: load <8 x i8> ; CHECK: load <8 x i8> ; CHECK: select <8 x i1> + +; YAML: Pass: slp-vectorizer +; YAML-NEXT: Name: VectorizedHorizontalReduction +; YAML-NEXT: Function: test_unrolled_select +; YAML-NEXT: Args: +; YAML-NEXT: - String: 'Vectorized horizontal reduction with cost ' +; YAML-NEXT: - Cost: '-33' +; YAML-NEXT: - String: ' and with tree size ' +; YAML-NEXT: - TreeSize: '10' + define i32 @test_unrolled_select(i8* noalias nocapture readonly %blk1, i8* noalias nocapture readonly %blk2, i32 %lx, i32 %h, i32 %lim) #0 { entry: %cmp.43 = icmp sgt i32 %h, 0 diff --git a/test/Transforms/SLPVectorizer/AArch64/remarks.ll b/test/Transforms/SLPVectorizer/AArch64/remarks.ll new file mode 100644 index 000000000000..e8c37512594e --- /dev/null +++ b/test/Transforms/SLPVectorizer/AArch64/remarks.ll @@ -0,0 +1,32 @@ +; RUN: opt -S -slp-vectorizer -mtriple=aarch64--linux-gnu -mcpu=generic -pass-remarks=slp-vectorizer -o /dev/null < %s 2>&1 | FileCheck %s + +define void @f(double* %r, double* %w) { + %r0 = getelementptr inbounds double, double* %r, i64 0 + %r1 = getelementptr inbounds double, double* %r, i64 1 + %f0 = load double, double* %r0 + %f1 = load double, double* %r1 + %add0 = fadd double %f0, %f0 + %add1 = fadd double %f1, %f1 + %w0 = getelementptr inbounds double, double* %w, i64 0 + %w1 = getelementptr inbounds double, double* %w, i64 1 +; CHECK: remark: /tmp/s.c:5:10: Stores SLP vectorized with cost -4 and with tree size 3 + store double %add0, double* %w0, !dbg !9 + store double %add1, double* %w1 + ret void +} + + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4, !5} +!llvm.ident = !{!6} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 4.0.0 (trunk 281293) (llvm/trunk 281290)", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2) +!1 = !DIFile(filename: "/tmp/s.c", directory: "/tmp") +!2 = !{} +!3 = !{i32 2, !"Dwarf Version", i32 4} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = 
!{i32 1, !"PIC Level", i32 2} +!6 = !{!"clang version 4.0.0 (trunk 281293) (llvm/trunk 281290)"} +!7 = distinct !DISubprogram(name: "baz", scope: !1, file: !1, line: 4, type: !8, isLocal: false, isDefinition: true, scopeLine: 4, isOptimized: true, unit: !0, variables: !2) +!8 = !DISubroutineType(types: !2) +!9 = !DILocation(line: 5, column: 10, scope: !7) diff --git a/test/Transforms/SLPVectorizer/X86/arith-add.ll b/test/Transforms/SLPVectorizer/X86/arith-add.ll new file mode 100644 index 000000000000..0266758b27d2 --- /dev/null +++ b/test/Transforms/SLPVectorizer/X86/arith-add.ll @@ -0,0 +1,649 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+avx512bw -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW + +@a64 = common global [8 x i64] zeroinitializer, align 64 +@b64 = common global [8 x i64] zeroinitializer, align 64 +@c64 = common global [8 x i64] zeroinitializer, align 64 +@a32 = common global [16 x i32] zeroinitializer, align 64 +@b32 = common global [16 x i32] zeroinitializer, align 64 +@c32 = common global [16 x i32] zeroinitializer, align 64 +@a16 = common global [32 x i16] zeroinitializer, align 64 +@b16 = common global [32 x i16] zeroinitializer, align 64 +@c16 = common global [32 x i16] zeroinitializer, align 64 +@a8 = common global [64 x i8] zeroinitializer, align 64 +@b8 = common global [64 x i8] zeroinitializer, align 64 +@c8 = common global [64 x i8] zeroinitializer, align 64 + +define void @add_v8i64() { +; SSE-LABEL: @add_v8i64( +; SSE-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP9:%.*]] = add <2 x i64> [[TMP1]], [[TMP5]] +; SSE-NEXT: [[TMP10:%.*]] = add <2 x i64> [[TMP2]], [[TMP6]] +; SSE-NEXT: [[TMP11:%.*]] = add <2 x i64> 
[[TMP3]], [[TMP7]] +; SSE-NEXT: [[TMP12:%.*]] = add <2 x i64> [[TMP4]], [[TMP8]] +; SSE-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 +; SSE-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 +; SSE-NEXT: ret void +; +; AVX-LABEL: @add_v8i64( +; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP5:%.*]] = add <4 x i64> [[TMP1]], [[TMP3]] +; AVX-NEXT: [[TMP6:%.*]] = add <4 x i64> [[TMP2]], [[TMP4]] +; AVX-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX-NEXT: ret void +; +; AVX512-LABEL: @add_v8i64( +; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8 +; AVX512-NEXT: [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8 +; AVX512-NEXT: [[TMP3:%.*]] = add <8 x i64> [[TMP1]], [[TMP2]] +; AVX512-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8 +; AVX512-NEXT: ret void +; + %a0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 + %a1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 + %a2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 + %a3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8 + %a4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8 + %a5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8 + %a6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8 + %a7 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8 + %b0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8 + %b1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8 + %b2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8 + %b3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8 + %b4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8 + %b5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8 + %b6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8 + %b7 = load i64, i64* 
getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8 + %r0 = add i64 %a0, %b0 + %r1 = add i64 %a1, %b1 + %r2 = add i64 %a2, %b2 + %r3 = add i64 %a3, %b3 + %r4 = add i64 %a4, %b4 + %r5 = add i64 %a5, %b5 + %r6 = add i64 %a6, %b6 + %r7 = add i64 %a7, %b7 + store i64 %r0, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8 + store i64 %r1, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8 + store i64 %r2, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8 + store i64 %r3, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8 + store i64 %r4, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8 + store i64 %r5, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8 + store i64 %r6, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8 + store i64 %r7, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8 + ret void +} + +define void @add_v16i32() { +; SSE-LABEL: @add_v16i32( +; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP9:%.*]] = add <4 x i32> [[TMP1]], [[TMP5]] +; SSE-NEXT: [[TMP10:%.*]] = add <4 x i32> [[TMP2]], [[TMP6]] +; SSE-NEXT: [[TMP11:%.*]] = add <4 x i32> [[TMP3]], [[TMP7]] +; SSE-NEXT: [[TMP12:%.*]] = add <4 x i32> [[TMP4]], [[TMP8]] +; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SSE-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: ret void +; +; AVX-LABEL: @add_v16i32( +; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 +; 
AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP5:%.*]] = add <8 x i32> [[TMP1]], [[TMP3]] +; AVX-NEXT: [[TMP6:%.*]] = add <8 x i32> [[TMP2]], [[TMP4]] +; AVX-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: ret void +; +; AVX512-LABEL: @add_v16i32( +; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @a32 to <16 x i32>*), align 4 +; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @b32 to <16 x i32>*), align 4 +; AVX512-NEXT: [[TMP3:%.*]] = add <16 x i32> [[TMP1]], [[TMP2]] +; AVX512-NEXT: store <16 x i32> [[TMP3]], <16 x i32>* bitcast ([16 x i32]* @c32 to <16 x i32>*), align 4 +; AVX512-NEXT: ret void +; + %a0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0 ), align 4 + %a1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1 ), align 4 + %a2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2 ), align 4 + %a3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3 ), align 4 + %a4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4 ), align 4 + %a5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5 ), align 4 + %a6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6 ), align 4 + %a7 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7 ), align 4 + %a8 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8 ), align 4 + %a9 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9 ), align 4 + %a10 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4 + %a11 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4 + %a12 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4 + %a13 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4 + %a14 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4 + %a15 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4 + %b0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0 ), align 4 + %b1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 1 ), align 4 + %b2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2 ), align 4 + %b3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 3 ), align 4 + %b4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4 ), align 4 + %b5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5 ), align 4 + %b6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6 ), align 4 + %b7 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7 ), align 4 + %b8 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8 ), align 4 + %b9 = 
load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9 ), align 4 + %b10 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4 + %b11 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4 + %b12 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4 + %b13 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 13), align 4 + %b14 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4 + %b15 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4 + %r0 = add i32 %a0 , %b0 + %r1 = add i32 %a1 , %b1 + %r2 = add i32 %a2 , %b2 + %r3 = add i32 %a3 , %b3 + %r4 = add i32 %a4 , %b4 + %r5 = add i32 %a5 , %b5 + %r6 = add i32 %a6 , %b6 + %r7 = add i32 %a7 , %b7 + %r8 = add i32 %a8 , %b8 + %r9 = add i32 %a9 , %b9 + %r10 = add i32 %a10, %b10 + %r11 = add i32 %a11, %b11 + %r12 = add i32 %a12, %b12 + %r13 = add i32 %a13, %b13 + %r14 = add i32 %a14, %b14 + %r15 = add i32 %a15, %b15 + store i32 %r0 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0 ), align 4 + store i32 %r1 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1 ), align 4 + store i32 %r2 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2 ), align 4 + store i32 %r3 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3 ), align 4 + store i32 %r4 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4 ), align 4 + store i32 %r5 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5 ), align 4 + store i32 %r6 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6 ), align 4 + store i32 %r7 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7 ), align 4 + store i32 %r8 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8 ), align 4 + store i32 %r9 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9 ), align 4 + store i32 %r10, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4 + store i32 %r11, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4 + store i32 %r12, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4 + store i32 %r13, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4 + store i32 %r14, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4 + store i32 %r15, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4 + ret void +} + +define void @add_v32i16() { +; SSE-LABEL: @add_v32i16( +; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP6:%.*]] = load <8 x 
i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP9:%.*]] = add <8 x i16> [[TMP1]], [[TMP5]]
+; SSE-NEXT: [[TMP10:%.*]] = add <8 x i16> [[TMP2]], [[TMP6]]
+; SSE-NEXT: [[TMP11:%.*]] = add <8 x i16> [[TMP3]], [[TMP7]]
+; SSE-NEXT: [[TMP12:%.*]] = add <8 x i16> [[TMP4]], [[TMP8]]
+; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
+; SSE-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2
+; SSE-NEXT: ret void
+;
+; AVX-LABEL: @add_v32i16(
+; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP5:%.*]] = add <16 x i16> [[TMP1]], [[TMP3]]
+; AVX-NEXT: [[TMP6:%.*]] = add <16 x i16> [[TMP2]], [[TMP4]]
+; AVX-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
+; AVX-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT: ret void
+;
+; AVX512-LABEL: @add_v32i16(
+; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
+; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX512-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
+; AVX512-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX512-NEXT: [[TMP5:%.*]] = add <16 x i16> [[TMP1]], [[TMP3]]
+; AVX512-NEXT: [[TMP6:%.*]] = add <16 x i16> [[TMP2]], [[TMP4]]
+; AVX512-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
+; AVX512-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX512-NEXT: ret void
+;
+ %a0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0 ), align 2
+ %a1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1 ), align 2
+ %a2 = load i16, i16* getelementptr inbounds ([32 x i16],
[32 x i16]* @a16, i32 0, i64 2 ), align 2 + %a3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3 ), align 2 + %a4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4 ), align 2 + %a5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5 ), align 2 + %a6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6 ), align 2 + %a7 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 7 ), align 2 + %a8 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8 ), align 2 + %a9 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 9 ), align 2 + %a10 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 10), align 2 + %a11 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 11), align 2 + %a12 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 12), align 2 + %a13 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 13), align 2 + %a14 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 14), align 2 + %a15 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 15), align 2 + %a16 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16), align 2 + %a17 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 17), align 2 + %a18 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 18), align 2 + %a19 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 19), align 2 + %a20 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 20), align 2 + %a21 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 21), align 2 + %a22 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 22), align 2 + %a23 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 23), align 2 + %a24 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24), align 2 + %a25 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 25), align 2 + %a26 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 26), align 2 + %a27 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 27), align 2 + %a28 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 28), align 2 + %a29 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 29), align 2 + %a30 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 30), align 2 + %a31 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 31), align 2 + %b0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 0 ), align 2 + %b1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 1 ), align 2 + %b2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 2 ), align 2 + %b3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 3 ), align 2 + %b4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 4 ), align 2 + %b5 = load i16, i16* getelementptr inbounds ([32 x 
i16], [32 x i16]* @b16, i32 0, i64 5 ), align 2 + %b6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 6 ), align 2 + %b7 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 7 ), align 2 + %b8 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8 ), align 2 + %b9 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 9 ), align 2 + %b10 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 10), align 2 + %b11 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 11), align 2 + %b12 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 12), align 2 + %b13 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 13), align 2 + %b14 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 14), align 2 + %b15 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 15), align 2 + %b16 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16), align 2 + %b17 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 17), align 2 + %b18 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 18), align 2 + %b19 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 19), align 2 + %b20 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 20), align 2 + %b21 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 21), align 2 + %b22 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 22), align 2 + %b23 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 23), align 2 + %b24 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24), align 2 + %b25 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 25), align 2 + %b26 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 26), align 2 + %b27 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 27), align 2 + %b28 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 28), align 2 + %b29 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 29), align 2 + %b30 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 30), align 2 + %b31 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 31), align 2 + %r0 = add i16 %a0 , %b0 + %r1 = add i16 %a1 , %b1 + %r2 = add i16 %a2 , %b2 + %r3 = add i16 %a3 , %b3 + %r4 = add i16 %a4 , %b4 + %r5 = add i16 %a5 , %b5 + %r6 = add i16 %a6 , %b6 + %r7 = add i16 %a7 , %b7 + %r8 = add i16 %a8 , %b8 + %r9 = add i16 %a9 , %b9 + %r10 = add i16 %a10, %b10 + %r11 = add i16 %a11, %b11 + %r12 = add i16 %a12, %b12 + %r13 = add i16 %a13, %b13 + %r14 = add i16 %a14, %b14 + %r15 = add i16 %a15, %b15 + %r16 = add i16 %a16, %b16 + %r17 = add i16 %a17, %b17 + %r18 = add i16 %a18, %b18 + %r19 = add i16 %a19, %b19 + %r20 = add i16 %a20, %b20 + %r21 = add i16 %a21, %b21 + %r22 = add i16 %a22, %b22 + %r23 = add i16 %a23, %b23 + %r24 = add i16 %a24, %b24 + %r25 = add i16 %a25, %b25 + %r26 = add i16 %a26, %b26 + %r27 = add i16 %a27, %b27 + %r28 = add i16 %a28, %b28 + %r29 = add i16 %a29, %b29 + %r30 = add i16 %a30, %b30 + %r31 = 
add i16 %a31, %b31 + store i16 %r0 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 0 ), align 2 + store i16 %r1 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 1 ), align 2 + store i16 %r2 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 2 ), align 2 + store i16 %r3 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 3 ), align 2 + store i16 %r4 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 4 ), align 2 + store i16 %r5 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 5 ), align 2 + store i16 %r6 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 6 ), align 2 + store i16 %r7 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 7 ), align 2 + store i16 %r8 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8 ), align 2 + store i16 %r9 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 9 ), align 2 + store i16 %r10, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 10), align 2 + store i16 %r11, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 11), align 2 + store i16 %r12, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 12), align 2 + store i16 %r13, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 13), align 2 + store i16 %r14, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 14), align 2 + store i16 %r15, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 15), align 2 + store i16 %r16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16), align 2 + store i16 %r17, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 17), align 2 + store i16 %r18, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 18), align 2 + store i16 %r19, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 19), align 2 + store i16 %r20, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 20), align 2 + store i16 %r21, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 21), align 2 + store i16 %r22, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 22), align 2 + store i16 %r23, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 23), align 2 + store i16 %r24, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24), align 2 + store i16 %r25, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 25), align 2 + store i16 %r26, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 26), align 2 + store i16 %r27, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 27), align 2 + store i16 %r28, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 28), align 2 + store i16 %r29, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2 + store i16 %r30, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2 + store i16 %r31, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2 + ret void +} + +define void @add_v64i8() { +; CHECK-LABEL: @add_v64i8( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 
0, i64 16) to <16 x i8>*), align 1 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 +; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 +; CHECK-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 +; CHECK-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 +; CHECK-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 +; CHECK-NEXT: [[TMP9:%.*]] = add <16 x i8> [[TMP1]], [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = add <16 x i8> [[TMP2]], [[TMP6]] +; CHECK-NEXT: [[TMP11:%.*]] = add <16 x i8> [[TMP3]], [[TMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = add <16 x i8> [[TMP4]], [[TMP8]] +; CHECK-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 +; CHECK-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 +; CHECK-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 +; CHECK-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1 +; CHECK-NEXT: ret void +; + %a0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 0 ), align 1 + %a1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 1 ), align 1 + %a2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 2 ), align 1 + %a3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 3 ), align 1 + %a4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 4 ), align 1 + %a5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 5 ), align 1 + %a6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 6 ), align 1 + %a7 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 7 ), align 1 + %a8 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 8 ), align 1 + %a9 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 9 ), align 1 + %a10 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 10), align 1 + %a11 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 11), align 1 + %a12 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 12), align 1 + %a13 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 13), align 1 + %a14 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 14), align 1 + %a15 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 15), align 1 + %a16 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16), align 1 + %a17 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 17), align 1 + %a18 = load i8, i8* 
getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 18), align 1 + %a19 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 19), align 1 + %a20 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 20), align 1 + %a21 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 21), align 1 + %a22 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 22), align 1 + %a23 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 23), align 1 + %a24 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 24), align 1 + %a25 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 25), align 1 + %a26 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 26), align 1 + %a27 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 27), align 1 + %a28 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 28), align 1 + %a29 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 29), align 1 + %a30 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 30), align 1 + %a31 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 31), align 1 + %a32 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32), align 1 + %a33 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 33), align 1 + %a34 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 34), align 1 + %a35 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 35), align 1 + %a36 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 36), align 1 + %a37 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 37), align 1 + %a38 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 38), align 1 + %a39 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 39), align 1 + %a40 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 40), align 1 + %a41 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 41), align 1 + %a42 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 42), align 1 + %a43 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 43), align 1 + %a44 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 44), align 1 + %a45 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 45), align 1 + %a46 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 46), align 1 + %a47 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 47), align 1 + %a48 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48), align 1 + %a49 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 49), align 1 + %a50 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 50), align 1 + %a51 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 51), align 1 + %a52 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 52), align 1 + %a53 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 53), align 1 + %a54 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 
54), align 1 + %a55 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 55), align 1 + %a56 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 56), align 1 + %a57 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 57), align 1 + %a58 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 58), align 1 + %a59 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 59), align 1 + %a60 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 60), align 1 + %a61 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 61), align 1 + %a62 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 62), align 1 + %a63 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 63), align 1 + %b0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 0 ), align 1 + %b1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 1 ), align 1 + %b2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 2 ), align 1 + %b3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 3 ), align 1 + %b4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 4 ), align 1 + %b5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 5 ), align 1 + %b6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 6 ), align 1 + %b7 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 7 ), align 1 + %b8 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 8 ), align 1 + %b9 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 9 ), align 1 + %b10 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 10), align 1 + %b11 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 11), align 1 + %b12 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 12), align 1 + %b13 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 13), align 1 + %b14 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 14), align 1 + %b15 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 15), align 1 + %b16 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16), align 1 + %b17 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 17), align 1 + %b18 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 18), align 1 + %b19 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 19), align 1 + %b20 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 20), align 1 + %b21 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 21), align 1 + %b22 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 22), align 1 + %b23 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 23), align 1 + %b24 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 24), align 1 + %b25 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 25), align 1 + %b26 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 26), align 1 + %b27 = load i8, i8* getelementptr inbounds ([64 x i8], 
[64 x i8]* @b8, i32 0, i64 27), align 1 + %b28 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 28), align 1 + %b29 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 29), align 1 + %b30 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 30), align 1 + %b31 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 31), align 1 + %b32 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32), align 1 + %b33 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 33), align 1 + %b34 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 34), align 1 + %b35 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 35), align 1 + %b36 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 36), align 1 + %b37 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 37), align 1 + %b38 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 38), align 1 + %b39 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 39), align 1 + %b40 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 40), align 1 + %b41 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 41), align 1 + %b42 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 42), align 1 + %b43 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 43), align 1 + %b44 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 44), align 1 + %b45 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 45), align 1 + %b46 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 46), align 1 + %b47 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 47), align 1 + %b48 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48), align 1 + %b49 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 49), align 1 + %b50 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 50), align 1 + %b51 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 51), align 1 + %b52 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 52), align 1 + %b53 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 53), align 1 + %b54 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 54), align 1 + %b55 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 55), align 1 + %b56 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 56), align 1 + %b57 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 57), align 1 + %b58 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 58), align 1 + %b59 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 59), align 1 + %b60 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 60), align 1 + %b61 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 61), align 1 + %b62 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 62), align 1 + %b63 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 63), align 1 + %r0 = add i8 %a0 , 
%b0 + %r1 = add i8 %a1 , %b1 + %r2 = add i8 %a2 , %b2 + %r3 = add i8 %a3 , %b3 + %r4 = add i8 %a4 , %b4 + %r5 = add i8 %a5 , %b5 + %r6 = add i8 %a6 , %b6 + %r7 = add i8 %a7 , %b7 + %r8 = add i8 %a8 , %b8 + %r9 = add i8 %a9 , %b9 + %r10 = add i8 %a10, %b10 + %r11 = add i8 %a11, %b11 + %r12 = add i8 %a12, %b12 + %r13 = add i8 %a13, %b13 + %r14 = add i8 %a14, %b14 + %r15 = add i8 %a15, %b15 + %r16 = add i8 %a16, %b16 + %r17 = add i8 %a17, %b17 + %r18 = add i8 %a18, %b18 + %r19 = add i8 %a19, %b19 + %r20 = add i8 %a20, %b20 + %r21 = add i8 %a21, %b21 + %r22 = add i8 %a22, %b22 + %r23 = add i8 %a23, %b23 + %r24 = add i8 %a24, %b24 + %r25 = add i8 %a25, %b25 + %r26 = add i8 %a26, %b26 + %r27 = add i8 %a27, %b27 + %r28 = add i8 %a28, %b28 + %r29 = add i8 %a29, %b29 + %r30 = add i8 %a30, %b30 + %r31 = add i8 %a31, %b31 + %r32 = add i8 %a32, %b32 + %r33 = add i8 %a33, %b33 + %r34 = add i8 %a34, %b34 + %r35 = add i8 %a35, %b35 + %r36 = add i8 %a36, %b36 + %r37 = add i8 %a37, %b37 + %r38 = add i8 %a38, %b38 + %r39 = add i8 %a39, %b39 + %r40 = add i8 %a40, %b40 + %r41 = add i8 %a41, %b41 + %r42 = add i8 %a42, %b42 + %r43 = add i8 %a43, %b43 + %r44 = add i8 %a44, %b44 + %r45 = add i8 %a45, %b45 + %r46 = add i8 %a46, %b46 + %r47 = add i8 %a47, %b47 + %r48 = add i8 %a48, %b48 + %r49 = add i8 %a49, %b49 + %r50 = add i8 %a50, %b50 + %r51 = add i8 %a51, %b51 + %r52 = add i8 %a52, %b52 + %r53 = add i8 %a53, %b53 + %r54 = add i8 %a54, %b54 + %r55 = add i8 %a55, %b55 + %r56 = add i8 %a56, %b56 + %r57 = add i8 %a57, %b57 + %r58 = add i8 %a58, %b58 + %r59 = add i8 %a59, %b59 + %r60 = add i8 %a60, %b60 + %r61 = add i8 %a61, %b61 + %r62 = add i8 %a62, %b62 + %r63 = add i8 %a63, %b63 + store i8 %r0 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 0 ), align 1 + store i8 %r1 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 1 ), align 1 + store i8 %r2 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 2 ), align 1 + store i8 %r3 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 3 ), align 1 + store i8 %r4 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 4 ), align 1 + store i8 %r5 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 5 ), align 1 + store i8 %r6 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 6 ), align 1 + store i8 %r7 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 7 ), align 1 + store i8 %r8 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 8 ), align 1 + store i8 %r9 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 9 ), align 1 + store i8 %r10, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 10), align 1 + store i8 %r11, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 11), align 1 + store i8 %r12, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 12), align 1 + store i8 %r13, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 13), align 1 + store i8 %r14, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 14), align 1 + store i8 %r15, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 15), align 1 + store i8 %r16, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16), align 1 + store i8 %r17, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 17), align 1 + store i8 %r18, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 18), align 1 + store i8 %r19, i8* getelementptr inbounds 
([64 x i8], [64 x i8]* @c8, i32 0, i64 19), align 1 + store i8 %r20, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 20), align 1 + store i8 %r21, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 21), align 1 + store i8 %r22, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 22), align 1 + store i8 %r23, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 23), align 1 + store i8 %r24, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 24), align 1 + store i8 %r25, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 25), align 1 + store i8 %r26, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 26), align 1 + store i8 %r27, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 27), align 1 + store i8 %r28, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 28), align 1 + store i8 %r29, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 29), align 1 + store i8 %r30, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 30), align 1 + store i8 %r31, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 31), align 1 + store i8 %r32, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32), align 1 + store i8 %r33, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 33), align 1 + store i8 %r34, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 34), align 1 + store i8 %r35, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 35), align 1 + store i8 %r36, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 36), align 1 + store i8 %r37, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 37), align 1 + store i8 %r38, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 38), align 1 + store i8 %r39, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 39), align 1 + store i8 %r40, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 40), align 1 + store i8 %r41, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 41), align 1 + store i8 %r42, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 42), align 1 + store i8 %r43, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 43), align 1 + store i8 %r44, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 44), align 1 + store i8 %r45, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 45), align 1 + store i8 %r46, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 46), align 1 + store i8 %r47, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 47), align 1 + store i8 %r48, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48), align 1 + store i8 %r49, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 49), align 1 + store i8 %r50, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 50), align 1 + store i8 %r51, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 51), align 1 + store i8 %r52, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 52), align 1 + store i8 %r53, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 53), align 1 + store i8 %r54, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 54), align 1 + store i8 %r55, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 55), align 1 + store i8 %r56, i8* getelementptr inbounds 
([64 x i8], [64 x i8]* @c8, i32 0, i64 56), align 1
+ store i8 %r57, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 57), align 1
+ store i8 %r58, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 58), align 1
+ store i8 %r59, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 59), align 1
+ store i8 %r60, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 60), align 1
+ store i8 %r61, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 61), align 1
+ store i8 %r62, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 62), align 1
+ store i8 %r63, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 63), align 1
+ ret void
+}
diff --git a/test/Transforms/SLPVectorizer/X86/arith-mul.ll b/test/Transforms/SLPVectorizer/X86/arith-mul.ll
new file mode 100644
index 000000000000..95875d7f01fd
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/arith-mul.ll
@@ -0,0 +1,700 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW
+
+@a64 = common global [8 x i64] zeroinitializer, align 64
+@b64 = common global [8 x i64] zeroinitializer, align 64
+@c64 = common global [8 x i64] zeroinitializer, align 64
+@a32 = common global [16 x i32] zeroinitializer, align 64
+@b32 = common global [16 x i32] zeroinitializer, align 64
+@c32 = common global [16 x i32] zeroinitializer, align 64
+@a16 = common global [32 x i16] zeroinitializer, align 64
+@b16 = common global [32 x i16] zeroinitializer, align 64
+@c16 = common global [32 x i16] zeroinitializer, align 64
+@a8 = common global [64 x i8] zeroinitializer, align 64
+@b8 = common global [64 x i8] zeroinitializer, align 64
+@c8 = common global [64 x i8] zeroinitializer, align 64
+
+define void @mul_v8i64() {
+; SSE-LABEL: @mul_v8i64(
+; SSE-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
+; SSE-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
+; SSE-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
+; SSE-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
+; SSE-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
+; SSE-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
+; SSE-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8
+; SSE-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8
+; SSE-NEXT: [[B0:%.*]] = load i64, i64*
getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8 +; SSE-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8 +; SSE-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8 +; SSE-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8 +; SSE-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8 +; SSE-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8 +; SSE-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8 +; SSE-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8 +; SSE-NEXT: [[R0:%.*]] = mul i64 [[A0]], [[B0]] +; SSE-NEXT: [[R1:%.*]] = mul i64 [[A1]], [[B1]] +; SSE-NEXT: [[R2:%.*]] = mul i64 [[A2]], [[B2]] +; SSE-NEXT: [[R3:%.*]] = mul i64 [[A3]], [[B3]] +; SSE-NEXT: [[R4:%.*]] = mul i64 [[A4]], [[B4]] +; SSE-NEXT: [[R5:%.*]] = mul i64 [[A5]], [[B5]] +; SSE-NEXT: [[R6:%.*]] = mul i64 [[A6]], [[B6]] +; SSE-NEXT: [[R7:%.*]] = mul i64 [[A7]], [[B7]] +; SSE-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8 +; SSE-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8 +; SSE-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8 +; SSE-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8 +; SSE-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8 +; SSE-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8 +; SSE-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8 +; SSE-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8 +; SSE-NEXT: ret void +; +; AVX1-LABEL: @mul_v8i64( +; AVX1-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 +; AVX1-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 +; AVX1-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 +; AVX1-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8 +; AVX1-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8 +; AVX1-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8 +; AVX1-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8 +; AVX1-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8 +; AVX1-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8 +; AVX1-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8 +; AVX1-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8 +; AVX1-NEXT: [[B3:%.*]] = load i64, i64* 
getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8 +; AVX1-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8 +; AVX1-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8 +; AVX1-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8 +; AVX1-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8 +; AVX1-NEXT: [[R0:%.*]] = mul i64 [[A0]], [[B0]] +; AVX1-NEXT: [[R1:%.*]] = mul i64 [[A1]], [[B1]] +; AVX1-NEXT: [[R2:%.*]] = mul i64 [[A2]], [[B2]] +; AVX1-NEXT: [[R3:%.*]] = mul i64 [[A3]], [[B3]] +; AVX1-NEXT: [[R4:%.*]] = mul i64 [[A4]], [[B4]] +; AVX1-NEXT: [[R5:%.*]] = mul i64 [[A5]], [[B5]] +; AVX1-NEXT: [[R6:%.*]] = mul i64 [[A6]], [[B6]] +; AVX1-NEXT: [[R7:%.*]] = mul i64 [[A7]], [[B7]] +; AVX1-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8 +; AVX1-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8 +; AVX1-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8 +; AVX1-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8 +; AVX1-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8 +; AVX1-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8 +; AVX1-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8 +; AVX1-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8 +; AVX1-NEXT: ret void +; +; AVX2-LABEL: @mul_v8i64( +; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 +; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX2-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 +; AVX2-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX2-NEXT: [[TMP5:%.*]] = mul <4 x i64> [[TMP1]], [[TMP3]] +; AVX2-NEXT: [[TMP6:%.*]] = mul <4 x i64> [[TMP2]], [[TMP4]] +; AVX2-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX2-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX2-NEXT: ret void +; +; AVX512-LABEL: @mul_v8i64( +; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8 +; AVX512-NEXT: [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8 +; AVX512-NEXT: [[TMP3:%.*]] = mul <8 x i64> [[TMP1]], [[TMP2]] +; AVX512-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8 +; AVX512-NEXT: ret void +; + %a0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 + %a1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 + %a2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), 
align 8 + %a3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8 + %a4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8 + %a5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8 + %a6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8 + %a7 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8 + %b0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8 + %b1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8 + %b2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8 + %b3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8 + %b4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8 + %b5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8 + %b6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8 + %b7 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8 + %r0 = mul i64 %a0, %b0 + %r1 = mul i64 %a1, %b1 + %r2 = mul i64 %a2, %b2 + %r3 = mul i64 %a3, %b3 + %r4 = mul i64 %a4, %b4 + %r5 = mul i64 %a5, %b5 + %r6 = mul i64 %a6, %b6 + %r7 = mul i64 %a7, %b7 + store i64 %r0, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8 + store i64 %r1, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8 + store i64 %r2, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8 + store i64 %r3, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8 + store i64 %r4, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8 + store i64 %r5, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8 + store i64 %r6, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8 + store i64 %r7, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8 + ret void +} + +define void @mul_v16i32() { +; SSE-LABEL: @mul_v16i32( +; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP9:%.*]] = mul <4 x i32> 
[[TMP1]], [[TMP5]] +; SSE-NEXT: [[TMP10:%.*]] = mul <4 x i32> [[TMP2]], [[TMP6]] +; SSE-NEXT: [[TMP11:%.*]] = mul <4 x i32> [[TMP3]], [[TMP7]] +; SSE-NEXT: [[TMP12:%.*]] = mul <4 x i32> [[TMP4]], [[TMP8]] +; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SSE-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: ret void +; +; AVX-LABEL: @mul_v16i32( +; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP5:%.*]] = mul <8 x i32> [[TMP1]], [[TMP3]] +; AVX-NEXT: [[TMP6:%.*]] = mul <8 x i32> [[TMP2]], [[TMP4]] +; AVX-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: ret void +; +; AVX512-LABEL: @mul_v16i32( +; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @a32 to <16 x i32>*), align 4 +; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @b32 to <16 x i32>*), align 4 +; AVX512-NEXT: [[TMP3:%.*]] = mul <16 x i32> [[TMP1]], [[TMP2]] +; AVX512-NEXT: store <16 x i32> [[TMP3]], <16 x i32>* bitcast ([16 x i32]* @c32 to <16 x i32>*), align 4 +; AVX512-NEXT: ret void +; + %a0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0 ), align 4 + %a1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1 ), align 4 + %a2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2 ), align 4 + %a3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3 ), align 4 + %a4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4 ), align 4 + %a5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5 ), align 4 + %a6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6 ), align 4 + %a7 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7 ), align 4 + %a8 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8 ), align 4 + %a9 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9 ), align 4 + %a10 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4 + %a11 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4 + %a12 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4 + %a13 = load i32, i32* 
getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4 + %a14 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4 + %a15 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4 + %b0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0 ), align 4 + %b1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 1 ), align 4 + %b2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2 ), align 4 + %b3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 3 ), align 4 + %b4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4 ), align 4 + %b5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5 ), align 4 + %b6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6 ), align 4 + %b7 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7 ), align 4 + %b8 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8 ), align 4 + %b9 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9 ), align 4 + %b10 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4 + %b11 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4 + %b12 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4 + %b13 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 13), align 4 + %b14 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4 + %b15 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4 + %r0 = mul i32 %a0 , %b0 + %r1 = mul i32 %a1 , %b1 + %r2 = mul i32 %a2 , %b2 + %r3 = mul i32 %a3 , %b3 + %r4 = mul i32 %a4 , %b4 + %r5 = mul i32 %a5 , %b5 + %r6 = mul i32 %a6 , %b6 + %r7 = mul i32 %a7 , %b7 + %r8 = mul i32 %a8 , %b8 + %r9 = mul i32 %a9 , %b9 + %r10 = mul i32 %a10, %b10 + %r11 = mul i32 %a11, %b11 + %r12 = mul i32 %a12, %b12 + %r13 = mul i32 %a13, %b13 + %r14 = mul i32 %a14, %b14 + %r15 = mul i32 %a15, %b15 + store i32 %r0 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0 ), align 4 + store i32 %r1 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1 ), align 4 + store i32 %r2 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2 ), align 4 + store i32 %r3 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3 ), align 4 + store i32 %r4 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4 ), align 4 + store i32 %r5 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5 ), align 4 + store i32 %r6 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6 ), align 4 + store i32 %r7 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7 ), align 4 + store i32 %r8 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8 ), align 4 + store i32 %r9 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9 ), align 4 + store i32 %r10, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4 + store i32 %r11, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4 + store 
i32 %r12, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4 + store i32 %r13, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4 + store i32 %r14, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4 + store i32 %r15, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4 + ret void +} + +define void @mul_v32i16() { +; SSE-LABEL: @mul_v32i16( +; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP9:%.*]] = mul <8 x i16> [[TMP1]], [[TMP5]] +; SSE-NEXT: [[TMP10:%.*]] = mul <8 x i16> [[TMP2]], [[TMP6]] +; SSE-NEXT: [[TMP11:%.*]] = mul <8 x i16> [[TMP3]], [[TMP7]] +; SSE-NEXT: [[TMP12:%.*]] = mul <8 x i16> [[TMP4]], [[TMP8]] +; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 +; SSE-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: ret void +; +; AVX-LABEL: @mul_v32i16( +; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP5:%.*]] = mul <16 x i16> [[TMP1]], [[TMP3]] +; AVX-NEXT: [[TMP6:%.*]] = mul <16 x i16> [[TMP2]], [[TMP4]] +; AVX-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: ret void +; +; AVX512-LABEL: @mul_v32i16( +; AVX512-NEXT: [[TMP1:%.*]] = load <16 x 
i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2 +; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX512-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 +; AVX512-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX512-NEXT: [[TMP5:%.*]] = mul <16 x i16> [[TMP1]], [[TMP3]] +; AVX512-NEXT: [[TMP6:%.*]] = mul <16 x i16> [[TMP2]], [[TMP4]] +; AVX512-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX512-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX512-NEXT: ret void +; + %a0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0 ), align 2 + %a1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1 ), align 2 + %a2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2 ), align 2 + %a3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3 ), align 2 + %a4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4 ), align 2 + %a5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5 ), align 2 + %a6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6 ), align 2 + %a7 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 7 ), align 2 + %a8 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8 ), align 2 + %a9 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 9 ), align 2 + %a10 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 10), align 2 + %a11 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 11), align 2 + %a12 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 12), align 2 + %a13 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 13), align 2 + %a14 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 14), align 2 + %a15 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 15), align 2 + %a16 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16), align 2 + %a17 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 17), align 2 + %a18 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 18), align 2 + %a19 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 19), align 2 + %a20 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 20), align 2 + %a21 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 21), align 2 + %a22 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 22), align 2 + %a23 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 23), align 2 + %a24 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24), align 2 + %a25 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* 
@a16, i32 0, i64 25), align 2 + %a26 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 26), align 2 + %a27 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 27), align 2 + %a28 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 28), align 2 + %a29 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 29), align 2 + %a30 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 30), align 2 + %a31 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 31), align 2 + %b0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 0 ), align 2 + %b1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 1 ), align 2 + %b2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 2 ), align 2 + %b3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 3 ), align 2 + %b4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 4 ), align 2 + %b5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 5 ), align 2 + %b6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 6 ), align 2 + %b7 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 7 ), align 2 + %b8 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8 ), align 2 + %b9 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 9 ), align 2 + %b10 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 10), align 2 + %b11 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 11), align 2 + %b12 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 12), align 2 + %b13 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 13), align 2 + %b14 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 14), align 2 + %b15 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 15), align 2 + %b16 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16), align 2 + %b17 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 17), align 2 + %b18 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 18), align 2 + %b19 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 19), align 2 + %b20 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 20), align 2 + %b21 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 21), align 2 + %b22 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 22), align 2 + %b23 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 23), align 2 + %b24 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24), align 2 + %b25 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 25), align 2 + %b26 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 26), align 2 + %b27 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 27), align 2 + %b28 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x 
i16]* @b16, i32 0, i64 28), align 2 + %b29 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 29), align 2 + %b30 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 30), align 2 + %b31 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 31), align 2 + %r0 = mul i16 %a0 , %b0 + %r1 = mul i16 %a1 , %b1 + %r2 = mul i16 %a2 , %b2 + %r3 = mul i16 %a3 , %b3 + %r4 = mul i16 %a4 , %b4 + %r5 = mul i16 %a5 , %b5 + %r6 = mul i16 %a6 , %b6 + %r7 = mul i16 %a7 , %b7 + %r8 = mul i16 %a8 , %b8 + %r9 = mul i16 %a9 , %b9 + %r10 = mul i16 %a10, %b10 + %r11 = mul i16 %a11, %b11 + %r12 = mul i16 %a12, %b12 + %r13 = mul i16 %a13, %b13 + %r14 = mul i16 %a14, %b14 + %r15 = mul i16 %a15, %b15 + %r16 = mul i16 %a16, %b16 + %r17 = mul i16 %a17, %b17 + %r18 = mul i16 %a18, %b18 + %r19 = mul i16 %a19, %b19 + %r20 = mul i16 %a20, %b20 + %r21 = mul i16 %a21, %b21 + %r22 = mul i16 %a22, %b22 + %r23 = mul i16 %a23, %b23 + %r24 = mul i16 %a24, %b24 + %r25 = mul i16 %a25, %b25 + %r26 = mul i16 %a26, %b26 + %r27 = mul i16 %a27, %b27 + %r28 = mul i16 %a28, %b28 + %r29 = mul i16 %a29, %b29 + %r30 = mul i16 %a30, %b30 + %r31 = mul i16 %a31, %b31 + store i16 %r0 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 0 ), align 2 + store i16 %r1 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 1 ), align 2 + store i16 %r2 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 2 ), align 2 + store i16 %r3 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 3 ), align 2 + store i16 %r4 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 4 ), align 2 + store i16 %r5 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 5 ), align 2 + store i16 %r6 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 6 ), align 2 + store i16 %r7 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 7 ), align 2 + store i16 %r8 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8 ), align 2 + store i16 %r9 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 9 ), align 2 + store i16 %r10, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 10), align 2 + store i16 %r11, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 11), align 2 + store i16 %r12, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 12), align 2 + store i16 %r13, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 13), align 2 + store i16 %r14, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 14), align 2 + store i16 %r15, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 15), align 2 + store i16 %r16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16), align 2 + store i16 %r17, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 17), align 2 + store i16 %r18, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 18), align 2 + store i16 %r19, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 19), align 2 + store i16 %r20, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 20), align 2 + store i16 %r21, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 21), align 2 + store i16 %r22, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 22), align 2 + store i16 
%r23, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 23), align 2 + store i16 %r24, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24), align 2 + store i16 %r25, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 25), align 2 + store i16 %r26, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 26), align 2 + store i16 %r27, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 27), align 2 + store i16 %r28, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 28), align 2 + store i16 %r29, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2 + store i16 %r30, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2 + store i16 %r31, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2 + ret void +} + +define void @mul_v64i8() { +; CHECK-LABEL: @mul_v64i8( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 +; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 +; CHECK-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 +; CHECK-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 +; CHECK-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 +; CHECK-NEXT: [[TMP9:%.*]] = mul <16 x i8> [[TMP1]], [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = mul <16 x i8> [[TMP2]], [[TMP6]] +; CHECK-NEXT: [[TMP11:%.*]] = mul <16 x i8> [[TMP3]], [[TMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = mul <16 x i8> [[TMP4]], [[TMP8]] +; CHECK-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 +; CHECK-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 +; CHECK-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 +; CHECK-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1 +; CHECK-NEXT: ret void +; + %a0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 0 ), align 1 + %a1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 1 ), align 1 + %a2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 2 ), align 1 + %a3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 3 ), align 1 + %a4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 4 ), align 1 + %a5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 5 
), align 1 + %a6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 6 ), align 1 + %a7 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 7 ), align 1 + %a8 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 8 ), align 1 + %a9 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 9 ), align 1 + %a10 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 10), align 1 + %a11 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 11), align 1 + %a12 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 12), align 1 + %a13 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 13), align 1 + %a14 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 14), align 1 + %a15 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 15), align 1 + %a16 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16), align 1 + %a17 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 17), align 1 + %a18 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 18), align 1 + %a19 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 19), align 1 + %a20 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 20), align 1 + %a21 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 21), align 1 + %a22 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 22), align 1 + %a23 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 23), align 1 + %a24 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 24), align 1 + %a25 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 25), align 1 + %a26 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 26), align 1 + %a27 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 27), align 1 + %a28 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 28), align 1 + %a29 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 29), align 1 + %a30 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 30), align 1 + %a31 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 31), align 1 + %a32 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32), align 1 + %a33 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 33), align 1 + %a34 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 34), align 1 + %a35 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 35), align 1 + %a36 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 36), align 1 + %a37 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 37), align 1 + %a38 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 38), align 1 + %a39 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 39), align 1 + %a40 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 40), align 1 + %a41 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 41), align 1 + %a42 = load i8, i8* getelementptr inbounds ([64 x 
i8], [64 x i8]* @a8, i32 0, i64 42), align 1 + %a43 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 43), align 1 + %a44 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 44), align 1 + %a45 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 45), align 1 + %a46 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 46), align 1 + %a47 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 47), align 1 + %a48 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48), align 1 + %a49 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 49), align 1 + %a50 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 50), align 1 + %a51 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 51), align 1 + %a52 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 52), align 1 + %a53 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 53), align 1 + %a54 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 54), align 1 + %a55 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 55), align 1 + %a56 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 56), align 1 + %a57 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 57), align 1 + %a58 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 58), align 1 + %a59 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 59), align 1 + %a60 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 60), align 1 + %a61 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 61), align 1 + %a62 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 62), align 1 + %a63 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 63), align 1 + %b0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 0 ), align 1 + %b1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 1 ), align 1 + %b2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 2 ), align 1 + %b3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 3 ), align 1 + %b4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 4 ), align 1 + %b5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 5 ), align 1 + %b6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 6 ), align 1 + %b7 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 7 ), align 1 + %b8 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 8 ), align 1 + %b9 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 9 ), align 1 + %b10 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 10), align 1 + %b11 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 11), align 1 + %b12 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 12), align 1 + %b13 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 13), align 1 + %b14 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 14), align 1 + %b15 = load i8, i8* 
getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 15), align 1 + %b16 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16), align 1 + %b17 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 17), align 1 + %b18 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 18), align 1 + %b19 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 19), align 1 + %b20 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 20), align 1 + %b21 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 21), align 1 + %b22 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 22), align 1 + %b23 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 23), align 1 + %b24 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 24), align 1 + %b25 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 25), align 1 + %b26 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 26), align 1 + %b27 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 27), align 1 + %b28 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 28), align 1 + %b29 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 29), align 1 + %b30 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 30), align 1 + %b31 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 31), align 1 + %b32 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32), align 1 + %b33 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 33), align 1 + %b34 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 34), align 1 + %b35 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 35), align 1 + %b36 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 36), align 1 + %b37 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 37), align 1 + %b38 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 38), align 1 + %b39 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 39), align 1 + %b40 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 40), align 1 + %b41 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 41), align 1 + %b42 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 42), align 1 + %b43 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 43), align 1 + %b44 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 44), align 1 + %b45 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 45), align 1 + %b46 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 46), align 1 + %b47 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 47), align 1 + %b48 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48), align 1 + %b49 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 49), align 1 + %b50 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 50), align 1 + %b51 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 
51), align 1 + %b52 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 52), align 1 + %b53 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 53), align 1 + %b54 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 54), align 1 + %b55 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 55), align 1 + %b56 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 56), align 1 + %b57 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 57), align 1 + %b58 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 58), align 1 + %b59 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 59), align 1 + %b60 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 60), align 1 + %b61 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 61), align 1 + %b62 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 62), align 1 + %b63 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 63), align 1 + %r0 = mul i8 %a0 , %b0 + %r1 = mul i8 %a1 , %b1 + %r2 = mul i8 %a2 , %b2 + %r3 = mul i8 %a3 , %b3 + %r4 = mul i8 %a4 , %b4 + %r5 = mul i8 %a5 , %b5 + %r6 = mul i8 %a6 , %b6 + %r7 = mul i8 %a7 , %b7 + %r8 = mul i8 %a8 , %b8 + %r9 = mul i8 %a9 , %b9 + %r10 = mul i8 %a10, %b10 + %r11 = mul i8 %a11, %b11 + %r12 = mul i8 %a12, %b12 + %r13 = mul i8 %a13, %b13 + %r14 = mul i8 %a14, %b14 + %r15 = mul i8 %a15, %b15 + %r16 = mul i8 %a16, %b16 + %r17 = mul i8 %a17, %b17 + %r18 = mul i8 %a18, %b18 + %r19 = mul i8 %a19, %b19 + %r20 = mul i8 %a20, %b20 + %r21 = mul i8 %a21, %b21 + %r22 = mul i8 %a22, %b22 + %r23 = mul i8 %a23, %b23 + %r24 = mul i8 %a24, %b24 + %r25 = mul i8 %a25, %b25 + %r26 = mul i8 %a26, %b26 + %r27 = mul i8 %a27, %b27 + %r28 = mul i8 %a28, %b28 + %r29 = mul i8 %a29, %b29 + %r30 = mul i8 %a30, %b30 + %r31 = mul i8 %a31, %b31 + %r32 = mul i8 %a32, %b32 + %r33 = mul i8 %a33, %b33 + %r34 = mul i8 %a34, %b34 + %r35 = mul i8 %a35, %b35 + %r36 = mul i8 %a36, %b36 + %r37 = mul i8 %a37, %b37 + %r38 = mul i8 %a38, %b38 + %r39 = mul i8 %a39, %b39 + %r40 = mul i8 %a40, %b40 + %r41 = mul i8 %a41, %b41 + %r42 = mul i8 %a42, %b42 + %r43 = mul i8 %a43, %b43 + %r44 = mul i8 %a44, %b44 + %r45 = mul i8 %a45, %b45 + %r46 = mul i8 %a46, %b46 + %r47 = mul i8 %a47, %b47 + %r48 = mul i8 %a48, %b48 + %r49 = mul i8 %a49, %b49 + %r50 = mul i8 %a50, %b50 + %r51 = mul i8 %a51, %b51 + %r52 = mul i8 %a52, %b52 + %r53 = mul i8 %a53, %b53 + %r54 = mul i8 %a54, %b54 + %r55 = mul i8 %a55, %b55 + %r56 = mul i8 %a56, %b56 + %r57 = mul i8 %a57, %b57 + %r58 = mul i8 %a58, %b58 + %r59 = mul i8 %a59, %b59 + %r60 = mul i8 %a60, %b60 + %r61 = mul i8 %a61, %b61 + %r62 = mul i8 %a62, %b62 + %r63 = mul i8 %a63, %b63 + store i8 %r0 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 0 ), align 1 + store i8 %r1 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 1 ), align 1 + store i8 %r2 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 2 ), align 1 + store i8 %r3 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 3 ), align 1 + store i8 %r4 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 4 ), align 1 + store i8 %r5 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 5 ), align 1 + store i8 %r6 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 6 ), align 
1 + store i8 %r7 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 7 ), align 1 + store i8 %r8 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 8 ), align 1 + store i8 %r9 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 9 ), align 1 + store i8 %r10, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 10), align 1 + store i8 %r11, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 11), align 1 + store i8 %r12, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 12), align 1 + store i8 %r13, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 13), align 1 + store i8 %r14, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 14), align 1 + store i8 %r15, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 15), align 1 + store i8 %r16, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16), align 1 + store i8 %r17, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 17), align 1 + store i8 %r18, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 18), align 1 + store i8 %r19, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 19), align 1 + store i8 %r20, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 20), align 1 + store i8 %r21, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 21), align 1 + store i8 %r22, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 22), align 1 + store i8 %r23, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 23), align 1 + store i8 %r24, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 24), align 1 + store i8 %r25, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 25), align 1 + store i8 %r26, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 26), align 1 + store i8 %r27, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 27), align 1 + store i8 %r28, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 28), align 1 + store i8 %r29, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 29), align 1 + store i8 %r30, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 30), align 1 + store i8 %r31, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 31), align 1 + store i8 %r32, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32), align 1 + store i8 %r33, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 33), align 1 + store i8 %r34, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 34), align 1 + store i8 %r35, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 35), align 1 + store i8 %r36, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 36), align 1 + store i8 %r37, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 37), align 1 + store i8 %r38, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 38), align 1 + store i8 %r39, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 39), align 1 + store i8 %r40, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 40), align 1 + store i8 %r41, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 41), align 1 + store i8 %r42, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 42), align 1 + store i8 %r43, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 43), align 1 
+ store i8 %r44, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 44), align 1
+ store i8 %r45, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 45), align 1
+ store i8 %r46, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 46), align 1
+ store i8 %r47, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 47), align 1
+ store i8 %r48, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48), align 1
+ store i8 %r49, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 49), align 1
+ store i8 %r50, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 50), align 1
+ store i8 %r51, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 51), align 1
+ store i8 %r52, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 52), align 1
+ store i8 %r53, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 53), align 1
+ store i8 %r54, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 54), align 1
+ store i8 %r55, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 55), align 1
+ store i8 %r56, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 56), align 1
+ store i8 %r57, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 57), align 1
+ store i8 %r58, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 58), align 1
+ store i8 %r59, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 59), align 1
+ store i8 %r60, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 60), align 1
+ store i8 %r61, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 61), align 1
+ store i8 %r62, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 62), align 1
+ store i8 %r63, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 63), align 1
+ ret void
+}
diff --git a/test/Transforms/SLPVectorizer/X86/arith-sub.ll b/test/Transforms/SLPVectorizer/X86/arith-sub.ll
new file mode 100644
index 000000000000..85838369e226
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/arith-sub.ll
@@ -0,0 +1,649 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW
+
+@a64 = common global [8 x i64] zeroinitializer, align 64
+@b64 = common global [8 x i64] zeroinitializer, align 64
+@c64 = common global [8 x i64] zeroinitializer, align 64
+@a32 = common global [16 x i32] zeroinitializer, align 64
+@b32 = common global [16 x i32] zeroinitializer, align 64
+@c32 = common global [16 x i32] zeroinitializer, align 64
+@a16 = common global [32 x i16] zeroinitializer, align 64
+@b16 = common global [32 x i16] zeroinitializer, align 64
+@c16 = common global [32 x i16] zeroinitializer, align 64
+@a8 = common global [64 x i8] zeroinitializer, align 64
+@b8 = common global [64 x i8] zeroinitializer, align 64
+@c8 = common global [64 x i8] zeroinitializer, align 64
+
+define void @sub_v8i64() {
+; SSE-LABEL: @sub_v8i64(
+; SSE-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP9:%.*]] = sub <2 x i64> [[TMP1]], [[TMP5]]
+; SSE-NEXT: [[TMP10:%.*]] = sub <2 x i64> [[TMP2]], [[TMP6]]
+; SSE-NEXT: [[TMP11:%.*]] = sub <2 x i64> [[TMP3]], [[TMP7]]
+; SSE-NEXT: [[TMP12:%.*]] = sub <2 x i64> [[TMP4]], [[TMP8]]
+; SSE-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
+; SSE-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
+; SSE-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
+; SSE-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8
+; SSE-NEXT: ret void
+;
+; AVX-LABEL: @sub_v8i64(
+; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
+; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
+; AVX-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX-NEXT: [[TMP5:%.*]] = sub <4 x i64> [[TMP1]], [[TMP3]]
+; AVX-NEXT: [[TMP6:%.*]] = sub <4 x i64> [[TMP2]], [[TMP4]]
+; AVX-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
+; AVX-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX-NEXT: ret void
+;
+; AVX512-LABEL: @sub_v8i64(
+; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8
+; AVX512-NEXT: [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8
+; AVX512-NEXT: [[TMP3:%.*]] = sub <8 x i64> [[TMP1]], [[TMP2]]
+; AVX512-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8
+; AVX512-NEXT: ret void
+;
+ %a0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
+ %a1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
+ %a2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
+ %a3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
+ %a4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
+ %a5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
+ %a6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8
+ %a7 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8
+ %b0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8
+ %b1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8
+ %b2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8
+ %b3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8
+ %b4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8
+ %b5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8
+ %b6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8
+ %b7 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8
+ %r0 = sub i64 %a0, %b0
+ %r1 = sub i64 %a1, %b1
+ %r2 = sub i64 %a2, %b2
+ %r3 = sub i64 %a3, %b3
+ %r4 = sub i64 %a4, %b4
+ %r5 = sub i64 %a5, %b5
+ %r6 = sub i64 %a6, %b6
+ %r7 = sub i64 %a7, %b7
+ store i64 %r0, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8
+ store i64 %r1, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8
+ store i64 %r2, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8
+ store i64 %r3, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8
+ store i64 %r4, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
+ store i64 %r5, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
+ store i64 %r6, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
+ store i64 %r7, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
+ ret void
+}
+
+define void @sub_v16i32() {
+; SSE-LABEL: @sub_v16i32(
+; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT:
[[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP9:%.*]] = sub <4 x i32> [[TMP1]], [[TMP5]] +; SSE-NEXT: [[TMP10:%.*]] = sub <4 x i32> [[TMP2]], [[TMP6]] +; SSE-NEXT: [[TMP11:%.*]] = sub <4 x i32> [[TMP3]], [[TMP7]] +; SSE-NEXT: [[TMP12:%.*]] = sub <4 x i32> [[TMP4]], [[TMP8]] +; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SSE-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: ret void +; +; AVX-LABEL: @sub_v16i32( +; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP5:%.*]] = sub <8 x i32> [[TMP1]], [[TMP3]] +; AVX-NEXT: [[TMP6:%.*]] = sub <8 x i32> [[TMP2]], [[TMP4]] +; AVX-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: ret void +; +; AVX512-LABEL: @sub_v16i32( +; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @a32 to <16 x i32>*), align 4 +; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @b32 to <16 x i32>*), align 4 +; AVX512-NEXT: [[TMP3:%.*]] = sub <16 x i32> [[TMP1]], [[TMP2]] +; AVX512-NEXT: store <16 x i32> [[TMP3]], <16 x i32>* bitcast ([16 x i32]* @c32 to <16 x i32>*), align 4 +; AVX512-NEXT: ret void +; + %a0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0 ), align 4 + %a1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1 ), align 4 + %a2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2 ), align 4 + %a3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3 ), align 4 + %a4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4 ), align 4 + %a5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5 ), align 4 + %a6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6 ), align 4 + %a7 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7 ), align 4 + %a8 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8 ), align 4 + %a9 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, 
i32 0, i64 9 ), align 4 + %a10 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4 + %a11 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4 + %a12 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4 + %a13 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4 + %a14 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4 + %a15 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4 + %b0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0 ), align 4 + %b1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 1 ), align 4 + %b2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2 ), align 4 + %b3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 3 ), align 4 + %b4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4 ), align 4 + %b5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5 ), align 4 + %b6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6 ), align 4 + %b7 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7 ), align 4 + %b8 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8 ), align 4 + %b9 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9 ), align 4 + %b10 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4 + %b11 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4 + %b12 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4 + %b13 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 13), align 4 + %b14 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4 + %b15 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4 + %r0 = sub i32 %a0 , %b0 + %r1 = sub i32 %a1 , %b1 + %r2 = sub i32 %a2 , %b2 + %r3 = sub i32 %a3 , %b3 + %r4 = sub i32 %a4 , %b4 + %r5 = sub i32 %a5 , %b5 + %r6 = sub i32 %a6 , %b6 + %r7 = sub i32 %a7 , %b7 + %r8 = sub i32 %a8 , %b8 + %r9 = sub i32 %a9 , %b9 + %r10 = sub i32 %a10, %b10 + %r11 = sub i32 %a11, %b11 + %r12 = sub i32 %a12, %b12 + %r13 = sub i32 %a13, %b13 + %r14 = sub i32 %a14, %b14 + %r15 = sub i32 %a15, %b15 + store i32 %r0 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0 ), align 4 + store i32 %r1 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1 ), align 4 + store i32 %r2 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2 ), align 4 + store i32 %r3 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3 ), align 4 + store i32 %r4 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4 ), align 4 + store i32 %r5 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5 ), align 4 + store i32 %r6 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6 ), align 4 + store i32 %r7 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7 ), align 4 + store i32 %r8 , i32* getelementptr inbounds ([16 x i32], 
[16 x i32]* @c32, i32 0, i64 8 ), align 4 + store i32 %r9 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9 ), align 4 + store i32 %r10, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4 + store i32 %r11, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4 + store i32 %r12, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4 + store i32 %r13, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4 + store i32 %r14, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4 + store i32 %r15, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4 + ret void +} + +define void @sub_v32i16() { +; SSE-LABEL: @sub_v32i16( +; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP9:%.*]] = sub <8 x i16> [[TMP1]], [[TMP5]] +; SSE-NEXT: [[TMP10:%.*]] = sub <8 x i16> [[TMP2]], [[TMP6]] +; SSE-NEXT: [[TMP11:%.*]] = sub <8 x i16> [[TMP3]], [[TMP7]] +; SSE-NEXT: [[TMP12:%.*]] = sub <8 x i16> [[TMP4]], [[TMP8]] +; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 +; SSE-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: ret void +; +; AVX-LABEL: @sub_v32i16( +; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP5:%.*]] = sub <16 x i16> [[TMP1]], [[TMP3]] +; AVX-NEXT: [[TMP6:%.*]] = sub <16 x i16> [[TMP2]], [[TMP4]] +; 
AVX-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: ret void +; +; AVX512-LABEL: @sub_v32i16( +; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2 +; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX512-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 +; AVX512-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX512-NEXT: [[TMP5:%.*]] = sub <16 x i16> [[TMP1]], [[TMP3]] +; AVX512-NEXT: [[TMP6:%.*]] = sub <16 x i16> [[TMP2]], [[TMP4]] +; AVX512-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX512-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX512-NEXT: ret void +; + %a0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0 ), align 2 + %a1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1 ), align 2 + %a2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2 ), align 2 + %a3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3 ), align 2 + %a4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4 ), align 2 + %a5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5 ), align 2 + %a6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6 ), align 2 + %a7 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 7 ), align 2 + %a8 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8 ), align 2 + %a9 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 9 ), align 2 + %a10 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 10), align 2 + %a11 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 11), align 2 + %a12 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 12), align 2 + %a13 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 13), align 2 + %a14 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 14), align 2 + %a15 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 15), align 2 + %a16 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16), align 2 + %a17 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 17), align 2 + %a18 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 18), align 2 + %a19 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 19), align 2 + %a20 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 20), align 2 + %a21 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 21), align 2 + %a22 = load i16, 
i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 22), align 2 + %a23 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 23), align 2 + %a24 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24), align 2 + %a25 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 25), align 2 + %a26 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 26), align 2 + %a27 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 27), align 2 + %a28 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 28), align 2 + %a29 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 29), align 2 + %a30 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 30), align 2 + %a31 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 31), align 2 + %b0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 0 ), align 2 + %b1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 1 ), align 2 + %b2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 2 ), align 2 + %b3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 3 ), align 2 + %b4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 4 ), align 2 + %b5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 5 ), align 2 + %b6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 6 ), align 2 + %b7 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 7 ), align 2 + %b8 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8 ), align 2 + %b9 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 9 ), align 2 + %b10 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 10), align 2 + %b11 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 11), align 2 + %b12 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 12), align 2 + %b13 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 13), align 2 + %b14 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 14), align 2 + %b15 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 15), align 2 + %b16 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16), align 2 + %b17 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 17), align 2 + %b18 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 18), align 2 + %b19 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 19), align 2 + %b20 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 20), align 2 + %b21 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 21), align 2 + %b22 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 22), align 2 + %b23 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 23), align 2 + %b24 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24), align 2 + %b25 = load 
i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 25), align 2 + %b26 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 26), align 2 + %b27 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 27), align 2 + %b28 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 28), align 2 + %b29 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 29), align 2 + %b30 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 30), align 2 + %b31 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 31), align 2 + %r0 = sub i16 %a0 , %b0 + %r1 = sub i16 %a1 , %b1 + %r2 = sub i16 %a2 , %b2 + %r3 = sub i16 %a3 , %b3 + %r4 = sub i16 %a4 , %b4 + %r5 = sub i16 %a5 , %b5 + %r6 = sub i16 %a6 , %b6 + %r7 = sub i16 %a7 , %b7 + %r8 = sub i16 %a8 , %b8 + %r9 = sub i16 %a9 , %b9 + %r10 = sub i16 %a10, %b10 + %r11 = sub i16 %a11, %b11 + %r12 = sub i16 %a12, %b12 + %r13 = sub i16 %a13, %b13 + %r14 = sub i16 %a14, %b14 + %r15 = sub i16 %a15, %b15 + %r16 = sub i16 %a16, %b16 + %r17 = sub i16 %a17, %b17 + %r18 = sub i16 %a18, %b18 + %r19 = sub i16 %a19, %b19 + %r20 = sub i16 %a20, %b20 + %r21 = sub i16 %a21, %b21 + %r22 = sub i16 %a22, %b22 + %r23 = sub i16 %a23, %b23 + %r24 = sub i16 %a24, %b24 + %r25 = sub i16 %a25, %b25 + %r26 = sub i16 %a26, %b26 + %r27 = sub i16 %a27, %b27 + %r28 = sub i16 %a28, %b28 + %r29 = sub i16 %a29, %b29 + %r30 = sub i16 %a30, %b30 + %r31 = sub i16 %a31, %b31 + store i16 %r0 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 0 ), align 2 + store i16 %r1 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 1 ), align 2 + store i16 %r2 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 2 ), align 2 + store i16 %r3 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 3 ), align 2 + store i16 %r4 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 4 ), align 2 + store i16 %r5 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 5 ), align 2 + store i16 %r6 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 6 ), align 2 + store i16 %r7 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 7 ), align 2 + store i16 %r8 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8 ), align 2 + store i16 %r9 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 9 ), align 2 + store i16 %r10, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 10), align 2 + store i16 %r11, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 11), align 2 + store i16 %r12, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 12), align 2 + store i16 %r13, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 13), align 2 + store i16 %r14, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 14), align 2 + store i16 %r15, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 15), align 2 + store i16 %r16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16), align 2 + store i16 %r17, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 17), align 2 + store i16 %r18, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 18), align 2 + store i16 %r19, i16* getelementptr inbounds ([32 x i16], 
[32 x i16]* @c16, i32 0, i64 19), align 2 + store i16 %r20, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 20), align 2 + store i16 %r21, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 21), align 2 + store i16 %r22, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 22), align 2 + store i16 %r23, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 23), align 2 + store i16 %r24, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24), align 2 + store i16 %r25, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 25), align 2 + store i16 %r26, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 26), align 2 + store i16 %r27, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 27), align 2 + store i16 %r28, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 28), align 2 + store i16 %r29, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2 + store i16 %r30, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2 + store i16 %r31, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2 + ret void +} + +define void @sub_v64i8() { +; CHECK-LABEL: @sub_v64i8( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 +; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 +; CHECK-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 +; CHECK-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 +; CHECK-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 +; CHECK-NEXT: [[TMP9:%.*]] = sub <16 x i8> [[TMP1]], [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = sub <16 x i8> [[TMP2]], [[TMP6]] +; CHECK-NEXT: [[TMP11:%.*]] = sub <16 x i8> [[TMP3]], [[TMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = sub <16 x i8> [[TMP4]], [[TMP8]] +; CHECK-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 +; CHECK-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 +; CHECK-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 +; CHECK-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1 +; CHECK-NEXT: ret void +; + %a0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 0 ), align 1 + %a1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 1 ), align 1 + %a2 = load i8, 
i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 2 ), align 1 + %a3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 3 ), align 1 + %a4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 4 ), align 1 + %a5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 5 ), align 1 + %a6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 6 ), align 1 + %a7 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 7 ), align 1 + %a8 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 8 ), align 1 + %a9 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 9 ), align 1 + %a10 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 10), align 1 + %a11 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 11), align 1 + %a12 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 12), align 1 + %a13 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 13), align 1 + %a14 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 14), align 1 + %a15 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 15), align 1 + %a16 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16), align 1 + %a17 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 17), align 1 + %a18 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 18), align 1 + %a19 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 19), align 1 + %a20 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 20), align 1 + %a21 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 21), align 1 + %a22 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 22), align 1 + %a23 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 23), align 1 + %a24 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 24), align 1 + %a25 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 25), align 1 + %a26 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 26), align 1 + %a27 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 27), align 1 + %a28 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 28), align 1 + %a29 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 29), align 1 + %a30 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 30), align 1 + %a31 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 31), align 1 + %a32 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32), align 1 + %a33 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 33), align 1 + %a34 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 34), align 1 + %a35 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 35), align 1 + %a36 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 36), align 1 + %a37 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 37), align 1 + %a38 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 
38), align 1 + %a39 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 39), align 1 + %a40 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 40), align 1 + %a41 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 41), align 1 + %a42 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 42), align 1 + %a43 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 43), align 1 + %a44 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 44), align 1 + %a45 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 45), align 1 + %a46 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 46), align 1 + %a47 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 47), align 1 + %a48 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48), align 1 + %a49 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 49), align 1 + %a50 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 50), align 1 + %a51 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 51), align 1 + %a52 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 52), align 1 + %a53 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 53), align 1 + %a54 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 54), align 1 + %a55 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 55), align 1 + %a56 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 56), align 1 + %a57 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 57), align 1 + %a58 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 58), align 1 + %a59 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 59), align 1 + %a60 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 60), align 1 + %a61 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 61), align 1 + %a62 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 62), align 1 + %a63 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 63), align 1 + %b0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 0 ), align 1 + %b1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 1 ), align 1 + %b2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 2 ), align 1 + %b3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 3 ), align 1 + %b4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 4 ), align 1 + %b5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 5 ), align 1 + %b6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 6 ), align 1 + %b7 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 7 ), align 1 + %b8 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 8 ), align 1 + %b9 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 9 ), align 1 + %b10 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 10), align 1 + %b11 = load i8, i8* getelementptr inbounds ([64 x i8], 
[64 x i8]* @b8, i32 0, i64 11), align 1 + %b12 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 12), align 1 + %b13 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 13), align 1 + %b14 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 14), align 1 + %b15 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 15), align 1 + %b16 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16), align 1 + %b17 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 17), align 1 + %b18 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 18), align 1 + %b19 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 19), align 1 + %b20 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 20), align 1 + %b21 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 21), align 1 + %b22 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 22), align 1 + %b23 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 23), align 1 + %b24 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 24), align 1 + %b25 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 25), align 1 + %b26 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 26), align 1 + %b27 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 27), align 1 + %b28 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 28), align 1 + %b29 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 29), align 1 + %b30 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 30), align 1 + %b31 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 31), align 1 + %b32 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32), align 1 + %b33 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 33), align 1 + %b34 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 34), align 1 + %b35 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 35), align 1 + %b36 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 36), align 1 + %b37 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 37), align 1 + %b38 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 38), align 1 + %b39 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 39), align 1 + %b40 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 40), align 1 + %b41 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 41), align 1 + %b42 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 42), align 1 + %b43 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 43), align 1 + %b44 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 44), align 1 + %b45 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 45), align 1 + %b46 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 46), align 1 + %b47 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 47), align 1 + %b48 = load i8, i8* 
getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48), align 1 + %b49 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 49), align 1 + %b50 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 50), align 1 + %b51 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 51), align 1 + %b52 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 52), align 1 + %b53 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 53), align 1 + %b54 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 54), align 1 + %b55 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 55), align 1 + %b56 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 56), align 1 + %b57 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 57), align 1 + %b58 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 58), align 1 + %b59 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 59), align 1 + %b60 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 60), align 1 + %b61 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 61), align 1 + %b62 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 62), align 1 + %b63 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 63), align 1 + %r0 = sub i8 %a0 , %b0 + %r1 = sub i8 %a1 , %b1 + %r2 = sub i8 %a2 , %b2 + %r3 = sub i8 %a3 , %b3 + %r4 = sub i8 %a4 , %b4 + %r5 = sub i8 %a5 , %b5 + %r6 = sub i8 %a6 , %b6 + %r7 = sub i8 %a7 , %b7 + %r8 = sub i8 %a8 , %b8 + %r9 = sub i8 %a9 , %b9 + %r10 = sub i8 %a10, %b10 + %r11 = sub i8 %a11, %b11 + %r12 = sub i8 %a12, %b12 + %r13 = sub i8 %a13, %b13 + %r14 = sub i8 %a14, %b14 + %r15 = sub i8 %a15, %b15 + %r16 = sub i8 %a16, %b16 + %r17 = sub i8 %a17, %b17 + %r18 = sub i8 %a18, %b18 + %r19 = sub i8 %a19, %b19 + %r20 = sub i8 %a20, %b20 + %r21 = sub i8 %a21, %b21 + %r22 = sub i8 %a22, %b22 + %r23 = sub i8 %a23, %b23 + %r24 = sub i8 %a24, %b24 + %r25 = sub i8 %a25, %b25 + %r26 = sub i8 %a26, %b26 + %r27 = sub i8 %a27, %b27 + %r28 = sub i8 %a28, %b28 + %r29 = sub i8 %a29, %b29 + %r30 = sub i8 %a30, %b30 + %r31 = sub i8 %a31, %b31 + %r32 = sub i8 %a32, %b32 + %r33 = sub i8 %a33, %b33 + %r34 = sub i8 %a34, %b34 + %r35 = sub i8 %a35, %b35 + %r36 = sub i8 %a36, %b36 + %r37 = sub i8 %a37, %b37 + %r38 = sub i8 %a38, %b38 + %r39 = sub i8 %a39, %b39 + %r40 = sub i8 %a40, %b40 + %r41 = sub i8 %a41, %b41 + %r42 = sub i8 %a42, %b42 + %r43 = sub i8 %a43, %b43 + %r44 = sub i8 %a44, %b44 + %r45 = sub i8 %a45, %b45 + %r46 = sub i8 %a46, %b46 + %r47 = sub i8 %a47, %b47 + %r48 = sub i8 %a48, %b48 + %r49 = sub i8 %a49, %b49 + %r50 = sub i8 %a50, %b50 + %r51 = sub i8 %a51, %b51 + %r52 = sub i8 %a52, %b52 + %r53 = sub i8 %a53, %b53 + %r54 = sub i8 %a54, %b54 + %r55 = sub i8 %a55, %b55 + %r56 = sub i8 %a56, %b56 + %r57 = sub i8 %a57, %b57 + %r58 = sub i8 %a58, %b58 + %r59 = sub i8 %a59, %b59 + %r60 = sub i8 %a60, %b60 + %r61 = sub i8 %a61, %b61 + %r62 = sub i8 %a62, %b62 + %r63 = sub i8 %a63, %b63 + store i8 %r0 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 0 ), align 1 + store i8 %r1 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 1 ), align 1 + store i8 %r2 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 2 ), align 1 + store i8 %r3 , i8* 
getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 3 ), align 1 + store i8 %r4 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 4 ), align 1 + store i8 %r5 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 5 ), align 1 + store i8 %r6 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 6 ), align 1 + store i8 %r7 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 7 ), align 1 + store i8 %r8 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 8 ), align 1 + store i8 %r9 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 9 ), align 1 + store i8 %r10, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 10), align 1 + store i8 %r11, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 11), align 1 + store i8 %r12, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 12), align 1 + store i8 %r13, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 13), align 1 + store i8 %r14, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 14), align 1 + store i8 %r15, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 15), align 1 + store i8 %r16, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16), align 1 + store i8 %r17, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 17), align 1 + store i8 %r18, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 18), align 1 + store i8 %r19, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 19), align 1 + store i8 %r20, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 20), align 1 + store i8 %r21, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 21), align 1 + store i8 %r22, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 22), align 1 + store i8 %r23, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 23), align 1 + store i8 %r24, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 24), align 1 + store i8 %r25, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 25), align 1 + store i8 %r26, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 26), align 1 + store i8 %r27, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 27), align 1 + store i8 %r28, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 28), align 1 + store i8 %r29, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 29), align 1 + store i8 %r30, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 30), align 1 + store i8 %r31, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 31), align 1 + store i8 %r32, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32), align 1 + store i8 %r33, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 33), align 1 + store i8 %r34, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 34), align 1 + store i8 %r35, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 35), align 1 + store i8 %r36, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 36), align 1 + store i8 %r37, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 37), align 1 + store i8 %r38, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 38), align 1 + store i8 %r39, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 39), align 1 + store i8 %r40, i8* 
getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 40), align 1 + store i8 %r41, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 41), align 1 + store i8 %r42, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 42), align 1 + store i8 %r43, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 43), align 1 + store i8 %r44, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 44), align 1 + store i8 %r45, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 45), align 1 + store i8 %r46, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 46), align 1 + store i8 %r47, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 47), align 1 + store i8 %r48, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48), align 1 + store i8 %r49, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 49), align 1 + store i8 %r50, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 50), align 1 + store i8 %r51, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 51), align 1 + store i8 %r52, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 52), align 1 + store i8 %r53, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 53), align 1 + store i8 %r54, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 54), align 1 + store i8 %r55, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 55), align 1 + store i8 %r56, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 56), align 1 + store i8 %r57, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 57), align 1 + store i8 %r58, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 58), align 1 + store i8 %r59, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 59), align 1 + store i8 %r60, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 60), align 1 + store i8 %r61, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 61), align 1 + store i8 %r62, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 62), align 1 + store i8 %r63, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 63), align 1 + ret void +} diff --git a/test/Transforms/SLPVectorizer/X86/shift-ashr.ll b/test/Transforms/SLPVectorizer/X86/shift-ashr.ll new file mode 100644 index 000000000000..646f599ce340 --- /dev/null +++ b/test/Transforms/SLPVectorizer/X86/shift-ashr.ll @@ -0,0 +1,913 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver4 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=XOP + +@a64 = common global [8 x 
i64] zeroinitializer, align 64 +@b64 = common global [8 x i64] zeroinitializer, align 64 +@c64 = common global [8 x i64] zeroinitializer, align 64 +@a32 = common global [16 x i32] zeroinitializer, align 64 +@b32 = common global [16 x i32] zeroinitializer, align 64 +@c32 = common global [16 x i32] zeroinitializer, align 64 +@a16 = common global [32 x i16] zeroinitializer, align 64 +@b16 = common global [32 x i16] zeroinitializer, align 64 +@c16 = common global [32 x i16] zeroinitializer, align 64 +@a8 = common global [64 x i8] zeroinitializer, align 64 +@b8 = common global [64 x i8] zeroinitializer, align 64 +@c8 = common global [64 x i8] zeroinitializer, align 64 + +define void @ashr_v8i64() { +; SSE-LABEL: @ashr_v8i64( +; SSE-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 +; SSE-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 +; SSE-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 +; SSE-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8 +; SSE-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8 +; SSE-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8 +; SSE-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8 +; SSE-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8 +; SSE-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8 +; SSE-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8 +; SSE-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8 +; SSE-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8 +; SSE-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8 +; SSE-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8 +; SSE-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8 +; SSE-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8 +; SSE-NEXT: [[R0:%.*]] = ashr i64 [[A0]], [[B0]] +; SSE-NEXT: [[R1:%.*]] = ashr i64 [[A1]], [[B1]] +; SSE-NEXT: [[R2:%.*]] = ashr i64 [[A2]], [[B2]] +; SSE-NEXT: [[R3:%.*]] = ashr i64 [[A3]], [[B3]] +; SSE-NEXT: [[R4:%.*]] = ashr i64 [[A4]], [[B4]] +; SSE-NEXT: [[R5:%.*]] = ashr i64 [[A5]], [[B5]] +; SSE-NEXT: [[R6:%.*]] = ashr i64 [[A6]], [[B6]] +; SSE-NEXT: [[R7:%.*]] = ashr i64 [[A7]], [[B7]] +; SSE-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8 +; SSE-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8 +; SSE-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8 +; SSE-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8 +; SSE-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8 +; SSE-NEXT: 
store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8 +; SSE-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8 +; SSE-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8 +; SSE-NEXT: ret void +; +; AVX1-LABEL: @ashr_v8i64( +; AVX1-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 +; AVX1-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 +; AVX1-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 +; AVX1-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8 +; AVX1-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8 +; AVX1-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8 +; AVX1-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8 +; AVX1-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8 +; AVX1-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8 +; AVX1-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8 +; AVX1-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8 +; AVX1-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8 +; AVX1-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8 +; AVX1-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8 +; AVX1-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8 +; AVX1-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8 +; AVX1-NEXT: [[R0:%.*]] = ashr i64 [[A0]], [[B0]] +; AVX1-NEXT: [[R1:%.*]] = ashr i64 [[A1]], [[B1]] +; AVX1-NEXT: [[R2:%.*]] = ashr i64 [[A2]], [[B2]] +; AVX1-NEXT: [[R3:%.*]] = ashr i64 [[A3]], [[B3]] +; AVX1-NEXT: [[R4:%.*]] = ashr i64 [[A4]], [[B4]] +; AVX1-NEXT: [[R5:%.*]] = ashr i64 [[A5]], [[B5]] +; AVX1-NEXT: [[R6:%.*]] = ashr i64 [[A6]], [[B6]] +; AVX1-NEXT: [[R7:%.*]] = ashr i64 [[A7]], [[B7]] +; AVX1-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8 +; AVX1-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8 +; AVX1-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8 +; AVX1-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8 +; AVX1-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8 +; AVX1-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8 +; AVX1-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8 +; AVX1-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8 +; 
AVX1-NEXT: ret void +; +; AVX2-LABEL: @ashr_v8i64( +; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 +; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX2-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 +; AVX2-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX2-NEXT: [[TMP5:%.*]] = ashr <4 x i64> [[TMP1]], [[TMP3]] +; AVX2-NEXT: [[TMP6:%.*]] = ashr <4 x i64> [[TMP2]], [[TMP4]] +; AVX2-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX2-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX2-NEXT: ret void +; +; AVX512-LABEL: @ashr_v8i64( +; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8 +; AVX512-NEXT: [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8 +; AVX512-NEXT: [[TMP3:%.*]] = ashr <8 x i64> [[TMP1]], [[TMP2]] +; AVX512-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8 +; AVX512-NEXT: ret void +; +; XOP-LABEL: @ashr_v8i64( +; XOP-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 +; XOP-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 +; XOP-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 +; XOP-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 +; XOP-NEXT: [[TMP5:%.*]] = ashr <4 x i64> [[TMP1]], [[TMP3]] +; XOP-NEXT: [[TMP6:%.*]] = ashr <4 x i64> [[TMP2]], [[TMP4]] +; XOP-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; XOP-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 +; XOP-NEXT: ret void +; + %a0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 + %a1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 + %a2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 + %a3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8 + %a4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8 + %a5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8 + %a6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8 + %a7 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8 + %b0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8 + %b1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8 + %b2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8 + %b3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), 
align 8 + %b4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8 + %b5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8 + %b6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8 + %b7 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8 + %r0 = ashr i64 %a0, %b0 + %r1 = ashr i64 %a1, %b1 + %r2 = ashr i64 %a2, %b2 + %r3 = ashr i64 %a3, %b3 + %r4 = ashr i64 %a4, %b4 + %r5 = ashr i64 %a5, %b5 + %r6 = ashr i64 %a6, %b6 + %r7 = ashr i64 %a7, %b7 + store i64 %r0, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8 + store i64 %r1, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8 + store i64 %r2, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8 + store i64 %r3, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8 + store i64 %r4, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8 + store i64 %r5, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8 + store i64 %r6, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8 + store i64 %r7, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8 + ret void +} + +define void @ashr_v16i32() { +; SSE-LABEL: @ashr_v16i32( +; SSE-NEXT: [[A0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0), align 4 +; SSE-NEXT: [[A1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1), align 4 +; SSE-NEXT: [[A2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2), align 4 +; SSE-NEXT: [[A3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3), align 4 +; SSE-NEXT: [[A4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4), align 4 +; SSE-NEXT: [[A5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5), align 4 +; SSE-NEXT: [[A6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6), align 4 +; SSE-NEXT: [[A7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7), align 4 +; SSE-NEXT: [[A8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8), align 4 +; SSE-NEXT: [[A9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9), align 4 +; SSE-NEXT: [[A10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4 +; SSE-NEXT: [[A11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4 +; SSE-NEXT: [[A12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4 +; SSE-NEXT: [[A13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4 +; SSE-NEXT: [[A14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4 +; SSE-NEXT: [[A15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4 +; SSE-NEXT: [[B0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0), align 4 +; SSE-NEXT: [[B1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], 
[16 x i32]* @b32, i32 0, i64 1), align 4 +; SSE-NEXT: [[B2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2), align 4 +; SSE-NEXT: [[B3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 3), align 4 +; SSE-NEXT: [[B4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4), align 4 +; SSE-NEXT: [[B5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5), align 4 +; SSE-NEXT: [[B6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6), align 4 +; SSE-NEXT: [[B7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7), align 4 +; SSE-NEXT: [[B8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8), align 4 +; SSE-NEXT: [[B9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9), align 4 +; SSE-NEXT: [[B10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4 +; SSE-NEXT: [[B11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4 +; SSE-NEXT: [[B12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4 +; SSE-NEXT: [[B13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 13), align 4 +; SSE-NEXT: [[B14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4 +; SSE-NEXT: [[B15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4 +; SSE-NEXT: [[R0:%.*]] = ashr i32 [[A0]], [[B0]] +; SSE-NEXT: [[R1:%.*]] = ashr i32 [[A1]], [[B1]] +; SSE-NEXT: [[R2:%.*]] = ashr i32 [[A2]], [[B2]] +; SSE-NEXT: [[R3:%.*]] = ashr i32 [[A3]], [[B3]] +; SSE-NEXT: [[R4:%.*]] = ashr i32 [[A4]], [[B4]] +; SSE-NEXT: [[R5:%.*]] = ashr i32 [[A5]], [[B5]] +; SSE-NEXT: [[R6:%.*]] = ashr i32 [[A6]], [[B6]] +; SSE-NEXT: [[R7:%.*]] = ashr i32 [[A7]], [[B7]] +; SSE-NEXT: [[R8:%.*]] = ashr i32 [[A8]], [[B8]] +; SSE-NEXT: [[R9:%.*]] = ashr i32 [[A9]], [[B9]] +; SSE-NEXT: [[R10:%.*]] = ashr i32 [[A10]], [[B10]] +; SSE-NEXT: [[R11:%.*]] = ashr i32 [[A11]], [[B11]] +; SSE-NEXT: [[R12:%.*]] = ashr i32 [[A12]], [[B12]] +; SSE-NEXT: [[R13:%.*]] = ashr i32 [[A13]], [[B13]] +; SSE-NEXT: [[R14:%.*]] = ashr i32 [[A14]], [[B14]] +; SSE-NEXT: [[R15:%.*]] = ashr i32 [[A15]], [[B15]] +; SSE-NEXT: store i32 [[R0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0), align 4 +; SSE-NEXT: store i32 [[R1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1), align 4 +; SSE-NEXT: store i32 [[R2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2), align 4 +; SSE-NEXT: store i32 [[R3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3), align 4 +; SSE-NEXT: store i32 [[R4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4), align 4 +; SSE-NEXT: store i32 [[R5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5), align 4 +; SSE-NEXT: store i32 [[R6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6), align 4 +; SSE-NEXT: store i32 [[R7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7), align 4 +; SSE-NEXT: store i32 [[R8]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8), align 4 +; SSE-NEXT: 
store i32 [[R9]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9), align 4 +; SSE-NEXT: store i32 [[R10]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4 +; SSE-NEXT: store i32 [[R11]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4 +; SSE-NEXT: store i32 [[R12]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4 +; SSE-NEXT: store i32 [[R13]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4 +; SSE-NEXT: store i32 [[R14]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4 +; SSE-NEXT: store i32 [[R15]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4 +; SSE-NEXT: ret void +; +; AVX1-LABEL: @ashr_v16i32( +; AVX1-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 +; AVX1-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; AVX1-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; AVX1-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; AVX1-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 +; AVX1-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 +; AVX1-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 +; AVX1-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 +; AVX1-NEXT: [[TMP9:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP5]] +; AVX1-NEXT: [[TMP10:%.*]] = ashr <4 x i32> [[TMP2]], [[TMP6]] +; AVX1-NEXT: [[TMP11:%.*]] = ashr <4 x i32> [[TMP3]], [[TMP7]] +; AVX1-NEXT: [[TMP12:%.*]] = ashr <4 x i32> [[TMP4]], [[TMP8]] +; AVX1-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; AVX1-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; AVX1-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; AVX1-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 +; AVX1-NEXT: ret void +; +; AVX2-LABEL: @ashr_v16i32( +; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 +; AVX2-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX2-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 +; AVX2-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX2-NEXT: [[TMP5:%.*]] = ashr <8 x i32> [[TMP1]], [[TMP3]] +; AVX2-NEXT: [[TMP6:%.*]] = ashr <8 x 
i32> [[TMP2]], [[TMP4]] +; AVX2-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX2-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX2-NEXT: ret void +; +; AVX512-LABEL: @ashr_v16i32( +; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @a32 to <16 x i32>*), align 4 +; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @b32 to <16 x i32>*), align 4 +; AVX512-NEXT: [[TMP3:%.*]] = ashr <16 x i32> [[TMP1]], [[TMP2]] +; AVX512-NEXT: store <16 x i32> [[TMP3]], <16 x i32>* bitcast ([16 x i32]* @c32 to <16 x i32>*), align 4 +; AVX512-NEXT: ret void +; +; XOP-LABEL: @ashr_v16i32( +; XOP-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 +; XOP-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 +; XOP-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 +; XOP-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 +; XOP-NEXT: [[TMP5:%.*]] = ashr <8 x i32> [[TMP1]], [[TMP3]] +; XOP-NEXT: [[TMP6:%.*]] = ashr <8 x i32> [[TMP2]], [[TMP4]] +; XOP-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; XOP-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4 +; XOP-NEXT: ret void +; + %a0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0 ), align 4 + %a1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1 ), align 4 + %a2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2 ), align 4 + %a3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3 ), align 4 + %a4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4 ), align 4 + %a5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5 ), align 4 + %a6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6 ), align 4 + %a7 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7 ), align 4 + %a8 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8 ), align 4 + %a9 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9 ), align 4 + %a10 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4 + %a11 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4 + %a12 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4 + %a13 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4 + %a14 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4 + %a15 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4 + %b0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0 ), align 4 + %b1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 1 ), align 4 + 
%b2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2 ), align 4 + %b3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 3 ), align 4 + %b4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4 ), align 4 + %b5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5 ), align 4 + %b6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6 ), align 4 + %b7 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7 ), align 4 + %b8 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8 ), align 4 + %b9 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9 ), align 4 + %b10 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4 + %b11 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4 + %b12 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4 + %b13 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 13), align 4 + %b14 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4 + %b15 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4 + %r0 = ashr i32 %a0 , %b0 + %r1 = ashr i32 %a1 , %b1 + %r2 = ashr i32 %a2 , %b2 + %r3 = ashr i32 %a3 , %b3 + %r4 = ashr i32 %a4 , %b4 + %r5 = ashr i32 %a5 , %b5 + %r6 = ashr i32 %a6 , %b6 + %r7 = ashr i32 %a7 , %b7 + %r8 = ashr i32 %a8 , %b8 + %r9 = ashr i32 %a9 , %b9 + %r10 = ashr i32 %a10, %b10 + %r11 = ashr i32 %a11, %b11 + %r12 = ashr i32 %a12, %b12 + %r13 = ashr i32 %a13, %b13 + %r14 = ashr i32 %a14, %b14 + %r15 = ashr i32 %a15, %b15 + store i32 %r0 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0 ), align 4 + store i32 %r1 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1 ), align 4 + store i32 %r2 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2 ), align 4 + store i32 %r3 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3 ), align 4 + store i32 %r4 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4 ), align 4 + store i32 %r5 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5 ), align 4 + store i32 %r6 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6 ), align 4 + store i32 %r7 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7 ), align 4 + store i32 %r8 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8 ), align 4 + store i32 %r9 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9 ), align 4 + store i32 %r10, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4 + store i32 %r11, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4 + store i32 %r12, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4 + store i32 %r13, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4 + store i32 %r14, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4 + store i32 %r15, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4 + ret void +} + +define void @ashr_v32i16() { +; SSE-LABEL: @ashr_v32i16( +; 
SSE-NEXT: [[A0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0), align 2 +; SSE-NEXT: [[A1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1), align 2 +; SSE-NEXT: [[A2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2), align 2 +; SSE-NEXT: [[A3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3), align 2 +; SSE-NEXT: [[A4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4), align 2 +; SSE-NEXT: [[A5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5), align 2 +; SSE-NEXT: [[A6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6), align 2 +; SSE-NEXT: [[A7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 7), align 2 +; SSE-NEXT: [[A8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8), align 2 +; SSE-NEXT: [[A9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 9), align 2 +; SSE-NEXT: [[A10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 10), align 2 +; SSE-NEXT: [[A11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 11), align 2 +; SSE-NEXT: [[A12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 12), align 2 +; SSE-NEXT: [[A13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 13), align 2 +; SSE-NEXT: [[A14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 14), align 2 +; SSE-NEXT: [[A15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 15), align 2 +; SSE-NEXT: [[A16:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16), align 2 +; SSE-NEXT: [[A17:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 17), align 2 +; SSE-NEXT: [[A18:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 18), align 2 +; SSE-NEXT: [[A19:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 19), align 2 +; SSE-NEXT: [[A20:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 20), align 2 +; SSE-NEXT: [[A21:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 21), align 2 +; SSE-NEXT: [[A22:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 22), align 2 +; SSE-NEXT: [[A23:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 23), align 2 +; SSE-NEXT: [[A24:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24), align 2 +; SSE-NEXT: [[A25:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 25), align 2 +; SSE-NEXT: [[A26:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 26), align 2 +; SSE-NEXT: [[A27:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 27), align 2 +; SSE-NEXT: [[A28:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 28), align 2 +; SSE-NEXT: [[A29:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, 
i32 0, i64 29), align 2 +; SSE-NEXT: [[A30:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 30), align 2 +; SSE-NEXT: [[A31:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 31), align 2 +; SSE-NEXT: [[B0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 0), align 2 +; SSE-NEXT: [[B1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 1), align 2 +; SSE-NEXT: [[B2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 2), align 2 +; SSE-NEXT: [[B3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 3), align 2 +; SSE-NEXT: [[B4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 4), align 2 +; SSE-NEXT: [[B5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 5), align 2 +; SSE-NEXT: [[B6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 6), align 2 +; SSE-NEXT: [[B7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 7), align 2 +; SSE-NEXT: [[B8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8), align 2 +; SSE-NEXT: [[B9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 9), align 2 +; SSE-NEXT: [[B10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 10), align 2 +; SSE-NEXT: [[B11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 11), align 2 +; SSE-NEXT: [[B12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 12), align 2 +; SSE-NEXT: [[B13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 13), align 2 +; SSE-NEXT: [[B14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 14), align 2 +; SSE-NEXT: [[B15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 15), align 2 +; SSE-NEXT: [[B16:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16), align 2 +; SSE-NEXT: [[B17:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 17), align 2 +; SSE-NEXT: [[B18:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 18), align 2 +; SSE-NEXT: [[B19:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 19), align 2 +; SSE-NEXT: [[B20:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 20), align 2 +; SSE-NEXT: [[B21:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 21), align 2 +; SSE-NEXT: [[B22:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 22), align 2 +; SSE-NEXT: [[B23:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 23), align 2 +; SSE-NEXT: [[B24:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24), align 2 +; SSE-NEXT: [[B25:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 25), align 2 +; SSE-NEXT: [[B26:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 26), align 2 +; SSE-NEXT: [[B27:%.*]] = load i16, i16* getelementptr inbounds ([32 
x i16], [32 x i16]* @b16, i32 0, i64 27), align 2 +; SSE-NEXT: [[B28:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 28), align 2 +; SSE-NEXT: [[B29:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 29), align 2 +; SSE-NEXT: [[B30:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 30), align 2 +; SSE-NEXT: [[B31:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 31), align 2 +; SSE-NEXT: [[R0:%.*]] = ashr i16 [[A0]], [[B0]] +; SSE-NEXT: [[R1:%.*]] = ashr i16 [[A1]], [[B1]] +; SSE-NEXT: [[R2:%.*]] = ashr i16 [[A2]], [[B2]] +; SSE-NEXT: [[R3:%.*]] = ashr i16 [[A3]], [[B3]] +; SSE-NEXT: [[R4:%.*]] = ashr i16 [[A4]], [[B4]] +; SSE-NEXT: [[R5:%.*]] = ashr i16 [[A5]], [[B5]] +; SSE-NEXT: [[R6:%.*]] = ashr i16 [[A6]], [[B6]] +; SSE-NEXT: [[R7:%.*]] = ashr i16 [[A7]], [[B7]] +; SSE-NEXT: [[R8:%.*]] = ashr i16 [[A8]], [[B8]] +; SSE-NEXT: [[R9:%.*]] = ashr i16 [[A9]], [[B9]] +; SSE-NEXT: [[R10:%.*]] = ashr i16 [[A10]], [[B10]] +; SSE-NEXT: [[R11:%.*]] = ashr i16 [[A11]], [[B11]] +; SSE-NEXT: [[R12:%.*]] = ashr i16 [[A12]], [[B12]] +; SSE-NEXT: [[R13:%.*]] = ashr i16 [[A13]], [[B13]] +; SSE-NEXT: [[R14:%.*]] = ashr i16 [[A14]], [[B14]] +; SSE-NEXT: [[R15:%.*]] = ashr i16 [[A15]], [[B15]] +; SSE-NEXT: [[R16:%.*]] = ashr i16 [[A16]], [[B16]] +; SSE-NEXT: [[R17:%.*]] = ashr i16 [[A17]], [[B17]] +; SSE-NEXT: [[R18:%.*]] = ashr i16 [[A18]], [[B18]] +; SSE-NEXT: [[R19:%.*]] = ashr i16 [[A19]], [[B19]] +; SSE-NEXT: [[R20:%.*]] = ashr i16 [[A20]], [[B20]] +; SSE-NEXT: [[R21:%.*]] = ashr i16 [[A21]], [[B21]] +; SSE-NEXT: [[R22:%.*]] = ashr i16 [[A22]], [[B22]] +; SSE-NEXT: [[R23:%.*]] = ashr i16 [[A23]], [[B23]] +; SSE-NEXT: [[R24:%.*]] = ashr i16 [[A24]], [[B24]] +; SSE-NEXT: [[R25:%.*]] = ashr i16 [[A25]], [[B25]] +; SSE-NEXT: [[R26:%.*]] = ashr i16 [[A26]], [[B26]] +; SSE-NEXT: [[R27:%.*]] = ashr i16 [[A27]], [[B27]] +; SSE-NEXT: [[R28:%.*]] = ashr i16 [[A28]], [[B28]] +; SSE-NEXT: [[R29:%.*]] = ashr i16 [[A29]], [[B29]] +; SSE-NEXT: [[R30:%.*]] = ashr i16 [[A30]], [[B30]] +; SSE-NEXT: [[R31:%.*]] = ashr i16 [[A31]], [[B31]] +; SSE-NEXT: store i16 [[R0]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 0), align 2 +; SSE-NEXT: store i16 [[R1]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 1), align 2 +; SSE-NEXT: store i16 [[R2]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 2), align 2 +; SSE-NEXT: store i16 [[R3]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 3), align 2 +; SSE-NEXT: store i16 [[R4]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 4), align 2 +; SSE-NEXT: store i16 [[R5]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 5), align 2 +; SSE-NEXT: store i16 [[R6]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 6), align 2 +; SSE-NEXT: store i16 [[R7]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 7), align 2 +; SSE-NEXT: store i16 [[R8]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8), align 2 +; SSE-NEXT: store i16 [[R9]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 9), align 2 +; SSE-NEXT: store i16 [[R10]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 10), align 2 +; SSE-NEXT: store i16 [[R11]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 11), align 
2 +; SSE-NEXT: store i16 [[R12]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 12), align 2 +; SSE-NEXT: store i16 [[R13]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 13), align 2 +; SSE-NEXT: store i16 [[R14]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 14), align 2 +; SSE-NEXT: store i16 [[R15]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 15), align 2 +; SSE-NEXT: store i16 [[R16]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16), align 2 +; SSE-NEXT: store i16 [[R17]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 17), align 2 +; SSE-NEXT: store i16 [[R18]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 18), align 2 +; SSE-NEXT: store i16 [[R19]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 19), align 2 +; SSE-NEXT: store i16 [[R20]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 20), align 2 +; SSE-NEXT: store i16 [[R21]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 21), align 2 +; SSE-NEXT: store i16 [[R22]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 22), align 2 +; SSE-NEXT: store i16 [[R23]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 23), align 2 +; SSE-NEXT: store i16 [[R24]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24), align 2 +; SSE-NEXT: store i16 [[R25]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 25), align 2 +; SSE-NEXT: store i16 [[R26]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 26), align 2 +; SSE-NEXT: store i16 [[R27]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 27), align 2 +; SSE-NEXT: store i16 [[R28]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 28), align 2 +; SSE-NEXT: store i16 [[R29]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2 +; SSE-NEXT: store i16 [[R30]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2 +; SSE-NEXT: store i16 [[R31]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2 +; SSE-NEXT: ret void +; +; AVX-LABEL: @ashr_v32i16( +; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP5:%.*]] = ashr <16 x i16> [[TMP1]], [[TMP3]] +; AVX-NEXT: [[TMP6:%.*]] = ashr <16 x i16> [[TMP2]], [[TMP4]] +; AVX-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: ret void +; +; AVX512-LABEL: @ashr_v32i16( +; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2 +; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds 
([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX512-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 +; AVX512-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX512-NEXT: [[TMP5:%.*]] = ashr <16 x i16> [[TMP1]], [[TMP3]] +; AVX512-NEXT: [[TMP6:%.*]] = ashr <16 x i16> [[TMP2]], [[TMP4]] +; AVX512-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX512-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX512-NEXT: ret void +; +; XOP-LABEL: @ashr_v32i16( +; XOP-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2 +; XOP-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 +; XOP-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 +; XOP-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 +; XOP-NEXT: [[TMP5:%.*]] = ashr <16 x i16> [[TMP1]], [[TMP3]] +; XOP-NEXT: [[TMP6:%.*]] = ashr <16 x i16> [[TMP2]], [[TMP4]] +; XOP-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; XOP-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2 +; XOP-NEXT: ret void +; + %a0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0 ), align 2 + %a1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1 ), align 2 + %a2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2 ), align 2 + %a3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3 ), align 2 + %a4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4 ), align 2 + %a5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5 ), align 2 + %a6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6 ), align 2 + %a7 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 7 ), align 2 + %a8 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8 ), align 2 + %a9 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 9 ), align 2 + %a10 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 10), align 2 + %a11 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 11), align 2 + %a12 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 12), align 2 + %a13 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 13), align 2 + %a14 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 14), align 2 + %a15 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 15), align 2 + %a16 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16), align 2 + %a17 = load i16, i16* getelementptr inbounds ([32 x i16], [32 
x i16]* @a16, i32 0, i64 17), align 2 + %a18 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 18), align 2 + %a19 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 19), align 2 + %a20 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 20), align 2 + %a21 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 21), align 2 + %a22 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 22), align 2 + %a23 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 23), align 2 + %a24 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24), align 2 + %a25 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 25), align 2 + %a26 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 26), align 2 + %a27 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 27), align 2 + %a28 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 28), align 2 + %a29 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 29), align 2 + %a30 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 30), align 2 + %a31 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 31), align 2 + %b0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 0 ), align 2 + %b1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 1 ), align 2 + %b2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 2 ), align 2 + %b3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 3 ), align 2 + %b4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 4 ), align 2 + %b5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 5 ), align 2 + %b6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 6 ), align 2 + %b7 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 7 ), align 2 + %b8 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8 ), align 2 + %b9 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 9 ), align 2 + %b10 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 10), align 2 + %b11 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 11), align 2 + %b12 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 12), align 2 + %b13 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 13), align 2 + %b14 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 14), align 2 + %b15 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 15), align 2 + %b16 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16), align 2 + %b17 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 17), align 2 + %b18 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 18), align 2 + %b19 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 19), align 2 + %b20 = load i16, i16* getelementptr inbounds ([32 x 
i16], [32 x i16]* @b16, i32 0, i64 20), align 2 + %b21 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 21), align 2 + %b22 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 22), align 2 + %b23 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 23), align 2 + %b24 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24), align 2 + %b25 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 25), align 2 + %b26 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 26), align 2 + %b27 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 27), align 2 + %b28 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 28), align 2 + %b29 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 29), align 2 + %b30 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 30), align 2 + %b31 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 31), align 2 + %r0 = ashr i16 %a0 , %b0 + %r1 = ashr i16 %a1 , %b1 + %r2 = ashr i16 %a2 , %b2 + %r3 = ashr i16 %a3 , %b3 + %r4 = ashr i16 %a4 , %b4 + %r5 = ashr i16 %a5 , %b5 + %r6 = ashr i16 %a6 , %b6 + %r7 = ashr i16 %a7 , %b7 + %r8 = ashr i16 %a8 , %b8 + %r9 = ashr i16 %a9 , %b9 + %r10 = ashr i16 %a10, %b10 + %r11 = ashr i16 %a11, %b11 + %r12 = ashr i16 %a12, %b12 + %r13 = ashr i16 %a13, %b13 + %r14 = ashr i16 %a14, %b14 + %r15 = ashr i16 %a15, %b15 + %r16 = ashr i16 %a16, %b16 + %r17 = ashr i16 %a17, %b17 + %r18 = ashr i16 %a18, %b18 + %r19 = ashr i16 %a19, %b19 + %r20 = ashr i16 %a20, %b20 + %r21 = ashr i16 %a21, %b21 + %r22 = ashr i16 %a22, %b22 + %r23 = ashr i16 %a23, %b23 + %r24 = ashr i16 %a24, %b24 + %r25 = ashr i16 %a25, %b25 + %r26 = ashr i16 %a26, %b26 + %r27 = ashr i16 %a27, %b27 + %r28 = ashr i16 %a28, %b28 + %r29 = ashr i16 %a29, %b29 + %r30 = ashr i16 %a30, %b30 + %r31 = ashr i16 %a31, %b31 + store i16 %r0 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 0 ), align 2 + store i16 %r1 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 1 ), align 2 + store i16 %r2 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 2 ), align 2 + store i16 %r3 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 3 ), align 2 + store i16 %r4 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 4 ), align 2 + store i16 %r5 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 5 ), align 2 + store i16 %r6 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 6 ), align 2 + store i16 %r7 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 7 ), align 2 + store i16 %r8 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8 ), align 2 + store i16 %r9 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 9 ), align 2 + store i16 %r10, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 10), align 2 + store i16 %r11, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 11), align 2 + store i16 %r12, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 12), align 2 + store i16 %r13, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 13), align 2 + store i16 %r14, i16* getelementptr inbounds ([32 x i16], [32 
x i16]* @c16, i32 0, i64 14), align 2 + store i16 %r15, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 15), align 2 + store i16 %r16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16), align 2 + store i16 %r17, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 17), align 2 + store i16 %r18, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 18), align 2 + store i16 %r19, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 19), align 2 + store i16 %r20, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 20), align 2 + store i16 %r21, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 21), align 2 + store i16 %r22, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 22), align 2 + store i16 %r23, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 23), align 2 + store i16 %r24, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24), align 2 + store i16 %r25, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 25), align 2 + store i16 %r26, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 26), align 2 + store i16 %r27, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 27), align 2 + store i16 %r28, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 28), align 2 + store i16 %r29, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2 + store i16 %r30, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2 + store i16 %r31, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2 + ret void +} + +define void @ashr_v64i8() { +; CHECK-LABEL: @ashr_v64i8( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 +; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 +; CHECK-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 +; CHECK-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 +; CHECK-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 +; CHECK-NEXT: [[TMP9:%.*]] = ashr <16 x i8> [[TMP1]], [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = ashr <16 x i8> [[TMP2]], [[TMP6]] +; CHECK-NEXT: [[TMP11:%.*]] = ashr <16 x i8> [[TMP3]], [[TMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = ashr <16 x i8> [[TMP4]], [[TMP8]] +; CHECK-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 +; CHECK-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 +; CHECK-NEXT: store <16 x i8> 
[[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 +; CHECK-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1 +; CHECK-NEXT: ret void +; + %a0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 0 ), align 1 + %a1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 1 ), align 1 + %a2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 2 ), align 1 + %a3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 3 ), align 1 + %a4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 4 ), align 1 + %a5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 5 ), align 1 + %a6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 6 ), align 1 + %a7 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 7 ), align 1 + %a8 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 8 ), align 1 + %a9 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 9 ), align 1 + %a10 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 10), align 1 + %a11 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 11), align 1 + %a12 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 12), align 1 + %a13 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 13), align 1 + %a14 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 14), align 1 + %a15 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 15), align 1 + %a16 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16), align 1 + %a17 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 17), align 1 + %a18 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 18), align 1 + %a19 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 19), align 1 + %a20 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 20), align 1 + %a21 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 21), align 1 + %a22 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 22), align 1 + %a23 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 23), align 1 + %a24 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 24), align 1 + %a25 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 25), align 1 + %a26 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 26), align 1 + %a27 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 27), align 1 + %a28 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 28), align 1 + %a29 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 29), align 1 + %a30 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 30), align 1 + %a31 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 31), align 1 + %a32 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32), align 1 + %a33 = load i8, i8* getelementptr inbounds ([64 x 
i8], [64 x i8]* @a8, i32 0, i64 33), align 1 + %a34 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 34), align 1 + %a35 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 35), align 1 + %a36 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 36), align 1 + %a37 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 37), align 1 + %a38 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 38), align 1 + %a39 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 39), align 1 + %a40 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 40), align 1 + %a41 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 41), align 1 + %a42 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 42), align 1 + %a43 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 43), align 1 + %a44 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 44), align 1 + %a45 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 45), align 1 + %a46 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 46), align 1 + %a47 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 47), align 1 + %a48 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48), align 1 + %a49 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 49), align 1 + %a50 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 50), align 1 + %a51 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 51), align 1 + %a52 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 52), align 1 + %a53 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 53), align 1 + %a54 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 54), align 1 + %a55 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 55), align 1 + %a56 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 56), align 1 + %a57 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 57), align 1 + %a58 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 58), align 1 + %a59 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 59), align 1 + %a60 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 60), align 1 + %a61 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 61), align 1 + %a62 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 62), align 1 + %a63 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 63), align 1 + %b0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 0 ), align 1 + %b1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 1 ), align 1 + %b2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 2 ), align 1 + %b3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 3 ), align 1 + %b4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 4 ), align 1 + %b5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 5 ), align 1 + %b6 = load i8, i8* 
getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 6 ), align 1 + %b7 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 7 ), align 1 + %b8 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 8 ), align 1 + %b9 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 9 ), align 1 + %b10 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 10), align 1 + %b11 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 11), align 1 + %b12 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 12), align 1 + %b13 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 13), align 1 + %b14 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 14), align 1 + %b15 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 15), align 1 + %b16 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16), align 1 + %b17 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 17), align 1 + %b18 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 18), align 1 + %b19 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 19), align 1 + %b20 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 20), align 1 + %b21 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 21), align 1 + %b22 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 22), align 1 + %b23 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 23), align 1 + %b24 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 24), align 1 + %b25 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 25), align 1 + %b26 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 26), align 1 + %b27 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 27), align 1 + %b28 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 28), align 1 + %b29 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 29), align 1 + %b30 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 30), align 1 + %b31 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 31), align 1 + %b32 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32), align 1 + %b33 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 33), align 1 + %b34 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 34), align 1 + %b35 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 35), align 1 + %b36 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 36), align 1 + %b37 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 37), align 1 + %b38 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 38), align 1 + %b39 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 39), align 1 + %b40 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 40), align 1 + %b41 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 41), align 1 + %b42 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 
42), align 1 + %b43 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 43), align 1 + %b44 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 44), align 1 + %b45 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 45), align 1 + %b46 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 46), align 1 + %b47 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 47), align 1 + %b48 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48), align 1 + %b49 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 49), align 1 + %b50 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 50), align 1 + %b51 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 51), align 1 + %b52 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 52), align 1 + %b53 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 53), align 1 + %b54 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 54), align 1 + %b55 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 55), align 1 + %b56 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 56), align 1 + %b57 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 57), align 1 + %b58 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 58), align 1 + %b59 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 59), align 1 + %b60 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 60), align 1 + %b61 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 61), align 1 + %b62 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 62), align 1 + %b63 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 63), align 1 + %r0 = ashr i8 %a0 , %b0 + %r1 = ashr i8 %a1 , %b1 + %r2 = ashr i8 %a2 , %b2 + %r3 = ashr i8 %a3 , %b3 + %r4 = ashr i8 %a4 , %b4 + %r5 = ashr i8 %a5 , %b5 + %r6 = ashr i8 %a6 , %b6 + %r7 = ashr i8 %a7 , %b7 + %r8 = ashr i8 %a8 , %b8 + %r9 = ashr i8 %a9 , %b9 + %r10 = ashr i8 %a10, %b10 + %r11 = ashr i8 %a11, %b11 + %r12 = ashr i8 %a12, %b12 + %r13 = ashr i8 %a13, %b13 + %r14 = ashr i8 %a14, %b14 + %r15 = ashr i8 %a15, %b15 + %r16 = ashr i8 %a16, %b16 + %r17 = ashr i8 %a17, %b17 + %r18 = ashr i8 %a18, %b18 + %r19 = ashr i8 %a19, %b19 + %r20 = ashr i8 %a20, %b20 + %r21 = ashr i8 %a21, %b21 + %r22 = ashr i8 %a22, %b22 + %r23 = ashr i8 %a23, %b23 + %r24 = ashr i8 %a24, %b24 + %r25 = ashr i8 %a25, %b25 + %r26 = ashr i8 %a26, %b26 + %r27 = ashr i8 %a27, %b27 + %r28 = ashr i8 %a28, %b28 + %r29 = ashr i8 %a29, %b29 + %r30 = ashr i8 %a30, %b30 + %r31 = ashr i8 %a31, %b31 + %r32 = ashr i8 %a32, %b32 + %r33 = ashr i8 %a33, %b33 + %r34 = ashr i8 %a34, %b34 + %r35 = ashr i8 %a35, %b35 + %r36 = ashr i8 %a36, %b36 + %r37 = ashr i8 %a37, %b37 + %r38 = ashr i8 %a38, %b38 + %r39 = ashr i8 %a39, %b39 + %r40 = ashr i8 %a40, %b40 + %r41 = ashr i8 %a41, %b41 + %r42 = ashr i8 %a42, %b42 + %r43 = ashr i8 %a43, %b43 + %r44 = ashr i8 %a44, %b44 + %r45 = ashr i8 %a45, %b45 + %r46 = ashr i8 %a46, %b46 + %r47 = ashr i8 %a47, %b47 + %r48 = ashr i8 %a48, %b48 + %r49 = ashr i8 %a49, %b49 + %r50 = ashr i8 %a50, %b50 + %r51 = ashr i8 %a51, %b51 + %r52 = ashr i8 %a52, %b52 + %r53 = ashr i8 %a53, %b53 + %r54 = 
ashr i8 %a54, %b54 + %r55 = ashr i8 %a55, %b55 + %r56 = ashr i8 %a56, %b56 + %r57 = ashr i8 %a57, %b57 + %r58 = ashr i8 %a58, %b58 + %r59 = ashr i8 %a59, %b59 + %r60 = ashr i8 %a60, %b60 + %r61 = ashr i8 %a61, %b61 + %r62 = ashr i8 %a62, %b62 + %r63 = ashr i8 %a63, %b63 + store i8 %r0 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 0 ), align 1 + store i8 %r1 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 1 ), align 1 + store i8 %r2 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 2 ), align 1 + store i8 %r3 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 3 ), align 1 + store i8 %r4 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 4 ), align 1 + store i8 %r5 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 5 ), align 1 + store i8 %r6 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 6 ), align 1 + store i8 %r7 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 7 ), align 1 + store i8 %r8 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 8 ), align 1 + store i8 %r9 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 9 ), align 1 + store i8 %r10, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 10), align 1 + store i8 %r11, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 11), align 1 + store i8 %r12, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 12), align 1 + store i8 %r13, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 13), align 1 + store i8 %r14, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 14), align 1 + store i8 %r15, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 15), align 1 + store i8 %r16, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16), align 1 + store i8 %r17, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 17), align 1 + store i8 %r18, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 18), align 1 + store i8 %r19, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 19), align 1 + store i8 %r20, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 20), align 1 + store i8 %r21, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 21), align 1 + store i8 %r22, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 22), align 1 + store i8 %r23, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 23), align 1 + store i8 %r24, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 24), align 1 + store i8 %r25, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 25), align 1 + store i8 %r26, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 26), align 1 + store i8 %r27, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 27), align 1 + store i8 %r28, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 28), align 1 + store i8 %r29, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 29), align 1 + store i8 %r30, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 30), align 1 + store i8 %r31, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 31), align 1 + store i8 %r32, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32), align 1 + store i8 %r33, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 33), align 1 + store i8 %r34, 
i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 34), align 1 + store i8 %r35, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 35), align 1 + store i8 %r36, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 36), align 1 + store i8 %r37, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 37), align 1 + store i8 %r38, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 38), align 1 + store i8 %r39, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 39), align 1 + store i8 %r40, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 40), align 1 + store i8 %r41, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 41), align 1 + store i8 %r42, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 42), align 1 + store i8 %r43, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 43), align 1 + store i8 %r44, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 44), align 1 + store i8 %r45, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 45), align 1 + store i8 %r46, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 46), align 1 + store i8 %r47, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 47), align 1 + store i8 %r48, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48), align 1 + store i8 %r49, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 49), align 1 + store i8 %r50, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 50), align 1 + store i8 %r51, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 51), align 1 + store i8 %r52, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 52), align 1 + store i8 %r53, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 53), align 1 + store i8 %r54, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 54), align 1 + store i8 %r55, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 55), align 1 + store i8 %r56, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 56), align 1 + store i8 %r57, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 57), align 1 + store i8 %r58, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 58), align 1 + store i8 %r59, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 59), align 1 + store i8 %r60, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 60), align 1 + store i8 %r61, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 61), align 1 + store i8 %r62, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 62), align 1 + store i8 %r63, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 63), align 1 + ret void +} diff --git a/test/Transforms/SLPVectorizer/X86/shift-lshr.ll b/test/Transforms/SLPVectorizer/X86/shift-lshr.ll new file mode 100644 index 000000000000..6fd78e7c9699 --- /dev/null +++ b/test/Transforms/SLPVectorizer/X86/shift-lshr.ll @@ -0,0 +1,862 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa 
-slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver4 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=XOP + +@a64 = common global [8 x i64] zeroinitializer, align 64 +@b64 = common global [8 x i64] zeroinitializer, align 64 +@c64 = common global [8 x i64] zeroinitializer, align 64 +@a32 = common global [16 x i32] zeroinitializer, align 64 +@b32 = common global [16 x i32] zeroinitializer, align 64 +@c32 = common global [16 x i32] zeroinitializer, align 64 +@a16 = common global [32 x i16] zeroinitializer, align 64 +@b16 = common global [32 x i16] zeroinitializer, align 64 +@c16 = common global [32 x i16] zeroinitializer, align 64 +@a8 = common global [64 x i8] zeroinitializer, align 64 +@b8 = common global [64 x i8] zeroinitializer, align 64 +@c8 = common global [64 x i8] zeroinitializer, align 64 + +define void @lshr_v8i64() { +; SSE-LABEL: @lshr_v8i64( +; SSE-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP9:%.*]] = lshr <2 x i64> [[TMP1]], [[TMP5]] +; SSE-NEXT: [[TMP10:%.*]] = lshr <2 x i64> [[TMP2]], [[TMP6]] +; SSE-NEXT: [[TMP11:%.*]] = lshr <2 x i64> [[TMP3]], [[TMP7]] +; SSE-NEXT: [[TMP12:%.*]] = lshr <2 x i64> [[TMP4]], [[TMP8]] +; SSE-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 +; SSE-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 +; SSE-NEXT: ret void +; +; AVX1-LABEL: @lshr_v8i64( +; AVX1-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 +; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 
x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 +; AVX1-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 +; AVX1-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 +; AVX1-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 +; AVX1-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 +; AVX1-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 +; AVX1-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 +; AVX1-NEXT: [[TMP9:%.*]] = lshr <2 x i64> [[TMP1]], [[TMP5]] +; AVX1-NEXT: [[TMP10:%.*]] = lshr <2 x i64> [[TMP2]], [[TMP6]] +; AVX1-NEXT: [[TMP11:%.*]] = lshr <2 x i64> [[TMP3]], [[TMP7]] +; AVX1-NEXT: [[TMP12:%.*]] = lshr <2 x i64> [[TMP4]], [[TMP8]] +; AVX1-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 +; AVX1-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 +; AVX1-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; AVX1-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 +; AVX1-NEXT: ret void +; +; AVX2-LABEL: @lshr_v8i64( +; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 +; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX2-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 +; AVX2-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX2-NEXT: [[TMP5:%.*]] = lshr <4 x i64> [[TMP1]], [[TMP3]] +; AVX2-NEXT: [[TMP6:%.*]] = lshr <4 x i64> [[TMP2]], [[TMP4]] +; AVX2-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX2-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX2-NEXT: ret void +; +; AVX512-LABEL: @lshr_v8i64( +; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8 +; AVX512-NEXT: [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8 +; AVX512-NEXT: [[TMP3:%.*]] = lshr <8 x i64> [[TMP1]], [[TMP2]] +; AVX512-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8 +; AVX512-NEXT: ret void +; +; XOP-LABEL: @lshr_v8i64( +; XOP-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 +; XOP-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 +; XOP-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* 
bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 +; XOP-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 +; XOP-NEXT: [[TMP5:%.*]] = lshr <4 x i64> [[TMP1]], [[TMP3]] +; XOP-NEXT: [[TMP6:%.*]] = lshr <4 x i64> [[TMP2]], [[TMP4]] +; XOP-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; XOP-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 +; XOP-NEXT: ret void +; + %a0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 + %a1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 + %a2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 + %a3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8 + %a4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8 + %a5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8 + %a6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8 + %a7 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8 + %b0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8 + %b1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8 + %b2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8 + %b3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8 + %b4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8 + %b5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8 + %b6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8 + %b7 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8 + %r0 = lshr i64 %a0, %b0 + %r1 = lshr i64 %a1, %b1 + %r2 = lshr i64 %a2, %b2 + %r3 = lshr i64 %a3, %b3 + %r4 = lshr i64 %a4, %b4 + %r5 = lshr i64 %a5, %b5 + %r6 = lshr i64 %a6, %b6 + %r7 = lshr i64 %a7, %b7 + store i64 %r0, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8 + store i64 %r1, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8 + store i64 %r2, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8 + store i64 %r3, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8 + store i64 %r4, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8 + store i64 %r5, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8 + store i64 %r6, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8 + store i64 %r7, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8 + ret void +} + +define void @lshr_v16i32() { +; SSE-LABEL: @lshr_v16i32( +; SSE-NEXT: [[A0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0), align 4 +; SSE-NEXT: [[A1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1), align 4 +; SSE-NEXT: [[A2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x 
i32]* @a32, i32 0, i64 2), align 4 +; SSE-NEXT: [[A3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3), align 4 +; SSE-NEXT: [[A4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4), align 4 +; SSE-NEXT: [[A5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5), align 4 +; SSE-NEXT: [[A6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6), align 4 +; SSE-NEXT: [[A7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7), align 4 +; SSE-NEXT: [[A8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8), align 4 +; SSE-NEXT: [[A9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9), align 4 +; SSE-NEXT: [[A10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4 +; SSE-NEXT: [[A11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4 +; SSE-NEXT: [[A12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4 +; SSE-NEXT: [[A13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4 +; SSE-NEXT: [[A14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4 +; SSE-NEXT: [[A15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4 +; SSE-NEXT: [[B0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0), align 4 +; SSE-NEXT: [[B1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 1), align 4 +; SSE-NEXT: [[B2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2), align 4 +; SSE-NEXT: [[B3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 3), align 4 +; SSE-NEXT: [[B4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4), align 4 +; SSE-NEXT: [[B5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5), align 4 +; SSE-NEXT: [[B6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6), align 4 +; SSE-NEXT: [[B7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7), align 4 +; SSE-NEXT: [[B8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8), align 4 +; SSE-NEXT: [[B9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9), align 4 +; SSE-NEXT: [[B10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4 +; SSE-NEXT: [[B11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4 +; SSE-NEXT: [[B12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4 +; SSE-NEXT: [[B13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 13), align 4 +; SSE-NEXT: [[B14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4 +; SSE-NEXT: [[B15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4 +; SSE-NEXT: [[R0:%.*]] = lshr i32 [[A0]], [[B0]] +; SSE-NEXT: [[R1:%.*]] 
= lshr i32 [[A1]], [[B1]] +; SSE-NEXT: [[R2:%.*]] = lshr i32 [[A2]], [[B2]] +; SSE-NEXT: [[R3:%.*]] = lshr i32 [[A3]], [[B3]] +; SSE-NEXT: [[R4:%.*]] = lshr i32 [[A4]], [[B4]] +; SSE-NEXT: [[R5:%.*]] = lshr i32 [[A5]], [[B5]] +; SSE-NEXT: [[R6:%.*]] = lshr i32 [[A6]], [[B6]] +; SSE-NEXT: [[R7:%.*]] = lshr i32 [[A7]], [[B7]] +; SSE-NEXT: [[R8:%.*]] = lshr i32 [[A8]], [[B8]] +; SSE-NEXT: [[R9:%.*]] = lshr i32 [[A9]], [[B9]] +; SSE-NEXT: [[R10:%.*]] = lshr i32 [[A10]], [[B10]] +; SSE-NEXT: [[R11:%.*]] = lshr i32 [[A11]], [[B11]] +; SSE-NEXT: [[R12:%.*]] = lshr i32 [[A12]], [[B12]] +; SSE-NEXT: [[R13:%.*]] = lshr i32 [[A13]], [[B13]] +; SSE-NEXT: [[R14:%.*]] = lshr i32 [[A14]], [[B14]] +; SSE-NEXT: [[R15:%.*]] = lshr i32 [[A15]], [[B15]] +; SSE-NEXT: store i32 [[R0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0), align 4 +; SSE-NEXT: store i32 [[R1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1), align 4 +; SSE-NEXT: store i32 [[R2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2), align 4 +; SSE-NEXT: store i32 [[R3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3), align 4 +; SSE-NEXT: store i32 [[R4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4), align 4 +; SSE-NEXT: store i32 [[R5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5), align 4 +; SSE-NEXT: store i32 [[R6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6), align 4 +; SSE-NEXT: store i32 [[R7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7), align 4 +; SSE-NEXT: store i32 [[R8]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8), align 4 +; SSE-NEXT: store i32 [[R9]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9), align 4 +; SSE-NEXT: store i32 [[R10]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4 +; SSE-NEXT: store i32 [[R11]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4 +; SSE-NEXT: store i32 [[R12]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4 +; SSE-NEXT: store i32 [[R13]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4 +; SSE-NEXT: store i32 [[R14]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4 +; SSE-NEXT: store i32 [[R15]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4 +; SSE-NEXT: ret void +; +; AVX-LABEL: @lshr_v16i32( +; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP5:%.*]] = lshr <8 x i32> [[TMP1]], [[TMP3]] +; AVX-NEXT: [[TMP6:%.*]] = lshr <8 x i32> [[TMP2]], [[TMP4]] +; AVX-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: ret void +; 
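+; Expected vectorization for lshr_v16i32: the SSE checks above leave all sixteen i32
+; shifts scalar, the AVX checks use two <8 x i32> lshr operations, the AVX512 checks
+; below expect a single <16 x i32> lshr, and the XOP checks match the two-halves AVX form.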
+; AVX512-LABEL: @lshr_v16i32( +; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @a32 to <16 x i32>*), align 4 +; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @b32 to <16 x i32>*), align 4 +; AVX512-NEXT: [[TMP3:%.*]] = lshr <16 x i32> [[TMP1]], [[TMP2]] +; AVX512-NEXT: store <16 x i32> [[TMP3]], <16 x i32>* bitcast ([16 x i32]* @c32 to <16 x i32>*), align 4 +; AVX512-NEXT: ret void +; +; XOP-LABEL: @lshr_v16i32( +; XOP-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 +; XOP-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 +; XOP-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 +; XOP-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 +; XOP-NEXT: [[TMP5:%.*]] = lshr <8 x i32> [[TMP1]], [[TMP3]] +; XOP-NEXT: [[TMP6:%.*]] = lshr <8 x i32> [[TMP2]], [[TMP4]] +; XOP-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; XOP-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4 +; XOP-NEXT: ret void +; + %a0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0 ), align 4 + %a1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1 ), align 4 + %a2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2 ), align 4 + %a3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3 ), align 4 + %a4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4 ), align 4 + %a5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5 ), align 4 + %a6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6 ), align 4 + %a7 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7 ), align 4 + %a8 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8 ), align 4 + %a9 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9 ), align 4 + %a10 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4 + %a11 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4 + %a12 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4 + %a13 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4 + %a14 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4 + %a15 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4 + %b0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0 ), align 4 + %b1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 1 ), align 4 + %b2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2 ), align 4 + %b3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 3 ), align 4 + %b4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4 ), align 4 + %b5 = 
load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5 ), align 4 + %b6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6 ), align 4 + %b7 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7 ), align 4 + %b8 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8 ), align 4 + %b9 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9 ), align 4 + %b10 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4 + %b11 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4 + %b12 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4 + %b13 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 13), align 4 + %b14 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4 + %b15 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4 + %r0 = lshr i32 %a0 , %b0 + %r1 = lshr i32 %a1 , %b1 + %r2 = lshr i32 %a2 , %b2 + %r3 = lshr i32 %a3 , %b3 + %r4 = lshr i32 %a4 , %b4 + %r5 = lshr i32 %a5 , %b5 + %r6 = lshr i32 %a6 , %b6 + %r7 = lshr i32 %a7 , %b7 + %r8 = lshr i32 %a8 , %b8 + %r9 = lshr i32 %a9 , %b9 + %r10 = lshr i32 %a10, %b10 + %r11 = lshr i32 %a11, %b11 + %r12 = lshr i32 %a12, %b12 + %r13 = lshr i32 %a13, %b13 + %r14 = lshr i32 %a14, %b14 + %r15 = lshr i32 %a15, %b15 + store i32 %r0 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0 ), align 4 + store i32 %r1 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1 ), align 4 + store i32 %r2 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2 ), align 4 + store i32 %r3 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3 ), align 4 + store i32 %r4 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4 ), align 4 + store i32 %r5 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5 ), align 4 + store i32 %r6 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6 ), align 4 + store i32 %r7 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7 ), align 4 + store i32 %r8 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8 ), align 4 + store i32 %r9 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9 ), align 4 + store i32 %r10, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4 + store i32 %r11, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4 + store i32 %r12, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4 + store i32 %r13, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4 + store i32 %r14, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4 + store i32 %r15, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4 + ret void +} + +define void @lshr_v32i16() { +; SSE-LABEL: @lshr_v32i16( +; SSE-NEXT: [[A0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0), align 2 +; SSE-NEXT: [[A1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1), align 2 +; SSE-NEXT: [[A2:%.*]] = load i16, i16* getelementptr inbounds ([32 x 
i16], [32 x i16]* @a16, i32 0, i64 2), align 2 +; SSE-NEXT: [[A3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3), align 2 +; SSE-NEXT: [[A4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4), align 2 +; SSE-NEXT: [[A5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5), align 2 +; SSE-NEXT: [[A6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6), align 2 +; SSE-NEXT: [[A7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 7), align 2 +; SSE-NEXT: [[A8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8), align 2 +; SSE-NEXT: [[A9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 9), align 2 +; SSE-NEXT: [[A10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 10), align 2 +; SSE-NEXT: [[A11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 11), align 2 +; SSE-NEXT: [[A12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 12), align 2 +; SSE-NEXT: [[A13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 13), align 2 +; SSE-NEXT: [[A14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 14), align 2 +; SSE-NEXT: [[A15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 15), align 2 +; SSE-NEXT: [[A16:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16), align 2 +; SSE-NEXT: [[A17:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 17), align 2 +; SSE-NEXT: [[A18:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 18), align 2 +; SSE-NEXT: [[A19:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 19), align 2 +; SSE-NEXT: [[A20:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 20), align 2 +; SSE-NEXT: [[A21:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 21), align 2 +; SSE-NEXT: [[A22:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 22), align 2 +; SSE-NEXT: [[A23:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 23), align 2 +; SSE-NEXT: [[A24:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24), align 2 +; SSE-NEXT: [[A25:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 25), align 2 +; SSE-NEXT: [[A26:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 26), align 2 +; SSE-NEXT: [[A27:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 27), align 2 +; SSE-NEXT: [[A28:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 28), align 2 +; SSE-NEXT: [[A29:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 29), align 2 +; SSE-NEXT: [[A30:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 30), align 2 +; SSE-NEXT: [[A31:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 31), align 2 +; SSE-NEXT: [[B0:%.*]] = load i16, i16* 
getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 0), align 2 +; SSE-NEXT: [[B1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 1), align 2 +; SSE-NEXT: [[B2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 2), align 2 +; SSE-NEXT: [[B3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 3), align 2 +; SSE-NEXT: [[B4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 4), align 2 +; SSE-NEXT: [[B5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 5), align 2 +; SSE-NEXT: [[B6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 6), align 2 +; SSE-NEXT: [[B7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 7), align 2 +; SSE-NEXT: [[B8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8), align 2 +; SSE-NEXT: [[B9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 9), align 2 +; SSE-NEXT: [[B10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 10), align 2 +; SSE-NEXT: [[B11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 11), align 2 +; SSE-NEXT: [[B12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 12), align 2 +; SSE-NEXT: [[B13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 13), align 2 +; SSE-NEXT: [[B14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 14), align 2 +; SSE-NEXT: [[B15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 15), align 2 +; SSE-NEXT: [[B16:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16), align 2 +; SSE-NEXT: [[B17:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 17), align 2 +; SSE-NEXT: [[B18:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 18), align 2 +; SSE-NEXT: [[B19:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 19), align 2 +; SSE-NEXT: [[B20:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 20), align 2 +; SSE-NEXT: [[B21:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 21), align 2 +; SSE-NEXT: [[B22:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 22), align 2 +; SSE-NEXT: [[B23:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 23), align 2 +; SSE-NEXT: [[B24:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24), align 2 +; SSE-NEXT: [[B25:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 25), align 2 +; SSE-NEXT: [[B26:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 26), align 2 +; SSE-NEXT: [[B27:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 27), align 2 +; SSE-NEXT: [[B28:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 28), align 2 +; SSE-NEXT: [[B29:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 29), align 2 +; SSE-NEXT: 
[[B30:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 30), align 2 +; SSE-NEXT: [[B31:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 31), align 2 +; SSE-NEXT: [[R0:%.*]] = lshr i16 [[A0]], [[B0]] +; SSE-NEXT: [[R1:%.*]] = lshr i16 [[A1]], [[B1]] +; SSE-NEXT: [[R2:%.*]] = lshr i16 [[A2]], [[B2]] +; SSE-NEXT: [[R3:%.*]] = lshr i16 [[A3]], [[B3]] +; SSE-NEXT: [[R4:%.*]] = lshr i16 [[A4]], [[B4]] +; SSE-NEXT: [[R5:%.*]] = lshr i16 [[A5]], [[B5]] +; SSE-NEXT: [[R6:%.*]] = lshr i16 [[A6]], [[B6]] +; SSE-NEXT: [[R7:%.*]] = lshr i16 [[A7]], [[B7]] +; SSE-NEXT: [[R8:%.*]] = lshr i16 [[A8]], [[B8]] +; SSE-NEXT: [[R9:%.*]] = lshr i16 [[A9]], [[B9]] +; SSE-NEXT: [[R10:%.*]] = lshr i16 [[A10]], [[B10]] +; SSE-NEXT: [[R11:%.*]] = lshr i16 [[A11]], [[B11]] +; SSE-NEXT: [[R12:%.*]] = lshr i16 [[A12]], [[B12]] +; SSE-NEXT: [[R13:%.*]] = lshr i16 [[A13]], [[B13]] +; SSE-NEXT: [[R14:%.*]] = lshr i16 [[A14]], [[B14]] +; SSE-NEXT: [[R15:%.*]] = lshr i16 [[A15]], [[B15]] +; SSE-NEXT: [[R16:%.*]] = lshr i16 [[A16]], [[B16]] +; SSE-NEXT: [[R17:%.*]] = lshr i16 [[A17]], [[B17]] +; SSE-NEXT: [[R18:%.*]] = lshr i16 [[A18]], [[B18]] +; SSE-NEXT: [[R19:%.*]] = lshr i16 [[A19]], [[B19]] +; SSE-NEXT: [[R20:%.*]] = lshr i16 [[A20]], [[B20]] +; SSE-NEXT: [[R21:%.*]] = lshr i16 [[A21]], [[B21]] +; SSE-NEXT: [[R22:%.*]] = lshr i16 [[A22]], [[B22]] +; SSE-NEXT: [[R23:%.*]] = lshr i16 [[A23]], [[B23]] +; SSE-NEXT: [[R24:%.*]] = lshr i16 [[A24]], [[B24]] +; SSE-NEXT: [[R25:%.*]] = lshr i16 [[A25]], [[B25]] +; SSE-NEXT: [[R26:%.*]] = lshr i16 [[A26]], [[B26]] +; SSE-NEXT: [[R27:%.*]] = lshr i16 [[A27]], [[B27]] +; SSE-NEXT: [[R28:%.*]] = lshr i16 [[A28]], [[B28]] +; SSE-NEXT: [[R29:%.*]] = lshr i16 [[A29]], [[B29]] +; SSE-NEXT: [[R30:%.*]] = lshr i16 [[A30]], [[B30]] +; SSE-NEXT: [[R31:%.*]] = lshr i16 [[A31]], [[B31]] +; SSE-NEXT: store i16 [[R0]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 0), align 2 +; SSE-NEXT: store i16 [[R1]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 1), align 2 +; SSE-NEXT: store i16 [[R2]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 2), align 2 +; SSE-NEXT: store i16 [[R3]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 3), align 2 +; SSE-NEXT: store i16 [[R4]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 4), align 2 +; SSE-NEXT: store i16 [[R5]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 5), align 2 +; SSE-NEXT: store i16 [[R6]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 6), align 2 +; SSE-NEXT: store i16 [[R7]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 7), align 2 +; SSE-NEXT: store i16 [[R8]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8), align 2 +; SSE-NEXT: store i16 [[R9]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 9), align 2 +; SSE-NEXT: store i16 [[R10]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 10), align 2 +; SSE-NEXT: store i16 [[R11]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 11), align 2 +; SSE-NEXT: store i16 [[R12]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 12), align 2 +; SSE-NEXT: store i16 [[R13]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 13), align 2 +; SSE-NEXT: store i16 [[R14]], i16* getelementptr inbounds ([32 x 
i16], [32 x i16]* @c16, i32 0, i64 14), align 2 +; SSE-NEXT: store i16 [[R15]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 15), align 2 +; SSE-NEXT: store i16 [[R16]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16), align 2 +; SSE-NEXT: store i16 [[R17]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 17), align 2 +; SSE-NEXT: store i16 [[R18]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 18), align 2 +; SSE-NEXT: store i16 [[R19]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 19), align 2 +; SSE-NEXT: store i16 [[R20]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 20), align 2 +; SSE-NEXT: store i16 [[R21]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 21), align 2 +; SSE-NEXT: store i16 [[R22]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 22), align 2 +; SSE-NEXT: store i16 [[R23]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 23), align 2 +; SSE-NEXT: store i16 [[R24]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24), align 2 +; SSE-NEXT: store i16 [[R25]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 25), align 2 +; SSE-NEXT: store i16 [[R26]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 26), align 2 +; SSE-NEXT: store i16 [[R27]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 27), align 2 +; SSE-NEXT: store i16 [[R28]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 28), align 2 +; SSE-NEXT: store i16 [[R29]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2 +; SSE-NEXT: store i16 [[R30]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2 +; SSE-NEXT: store i16 [[R31]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2 +; SSE-NEXT: ret void +; +; AVX-LABEL: @lshr_v32i16( +; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP5:%.*]] = lshr <16 x i16> [[TMP1]], [[TMP3]] +; AVX-NEXT: [[TMP6:%.*]] = lshr <16 x i16> [[TMP2]], [[TMP4]] +; AVX-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: ret void +; +; AVX512-LABEL: @lshr_v32i16( +; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2 +; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX512-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 +; AVX512-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x 
i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX512-NEXT: [[TMP5:%.*]] = lshr <16 x i16> [[TMP1]], [[TMP3]] +; AVX512-NEXT: [[TMP6:%.*]] = lshr <16 x i16> [[TMP2]], [[TMP4]] +; AVX512-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX512-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX512-NEXT: ret void +; +; XOP-LABEL: @lshr_v32i16( +; XOP-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2 +; XOP-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 +; XOP-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 +; XOP-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 +; XOP-NEXT: [[TMP5:%.*]] = lshr <16 x i16> [[TMP1]], [[TMP3]] +; XOP-NEXT: [[TMP6:%.*]] = lshr <16 x i16> [[TMP2]], [[TMP4]] +; XOP-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; XOP-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2 +; XOP-NEXT: ret void +; + %a0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0 ), align 2 + %a1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1 ), align 2 + %a2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2 ), align 2 + %a3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3 ), align 2 + %a4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4 ), align 2 + %a5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5 ), align 2 + %a6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6 ), align 2 + %a7 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 7 ), align 2 + %a8 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8 ), align 2 + %a9 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 9 ), align 2 + %a10 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 10), align 2 + %a11 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 11), align 2 + %a12 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 12), align 2 + %a13 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 13), align 2 + %a14 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 14), align 2 + %a15 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 15), align 2 + %a16 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16), align 2 + %a17 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 17), align 2 + %a18 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 18), align 2 + %a19 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 19), align 2 + %a20 = load i16, i16* getelementptr inbounds ([32 x 
i16], [32 x i16]* @a16, i32 0, i64 20), align 2 + %a21 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 21), align 2 + %a22 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 22), align 2 + %a23 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 23), align 2 + %a24 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24), align 2 + %a25 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 25), align 2 + %a26 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 26), align 2 + %a27 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 27), align 2 + %a28 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 28), align 2 + %a29 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 29), align 2 + %a30 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 30), align 2 + %a31 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 31), align 2 + %b0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 0 ), align 2 + %b1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 1 ), align 2 + %b2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 2 ), align 2 + %b3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 3 ), align 2 + %b4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 4 ), align 2 + %b5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 5 ), align 2 + %b6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 6 ), align 2 + %b7 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 7 ), align 2 + %b8 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8 ), align 2 + %b9 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 9 ), align 2 + %b10 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 10), align 2 + %b11 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 11), align 2 + %b12 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 12), align 2 + %b13 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 13), align 2 + %b14 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 14), align 2 + %b15 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 15), align 2 + %b16 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16), align 2 + %b17 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 17), align 2 + %b18 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 18), align 2 + %b19 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 19), align 2 + %b20 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 20), align 2 + %b21 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 21), align 2 + %b22 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 22), align 2 + %b23 = load i16, i16* getelementptr inbounds 
([32 x i16], [32 x i16]* @b16, i32 0, i64 23), align 2 + %b24 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24), align 2 + %b25 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 25), align 2 + %b26 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 26), align 2 + %b27 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 27), align 2 + %b28 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 28), align 2 + %b29 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 29), align 2 + %b30 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 30), align 2 + %b31 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 31), align 2 + %r0 = lshr i16 %a0 , %b0 + %r1 = lshr i16 %a1 , %b1 + %r2 = lshr i16 %a2 , %b2 + %r3 = lshr i16 %a3 , %b3 + %r4 = lshr i16 %a4 , %b4 + %r5 = lshr i16 %a5 , %b5 + %r6 = lshr i16 %a6 , %b6 + %r7 = lshr i16 %a7 , %b7 + %r8 = lshr i16 %a8 , %b8 + %r9 = lshr i16 %a9 , %b9 + %r10 = lshr i16 %a10, %b10 + %r11 = lshr i16 %a11, %b11 + %r12 = lshr i16 %a12, %b12 + %r13 = lshr i16 %a13, %b13 + %r14 = lshr i16 %a14, %b14 + %r15 = lshr i16 %a15, %b15 + %r16 = lshr i16 %a16, %b16 + %r17 = lshr i16 %a17, %b17 + %r18 = lshr i16 %a18, %b18 + %r19 = lshr i16 %a19, %b19 + %r20 = lshr i16 %a20, %b20 + %r21 = lshr i16 %a21, %b21 + %r22 = lshr i16 %a22, %b22 + %r23 = lshr i16 %a23, %b23 + %r24 = lshr i16 %a24, %b24 + %r25 = lshr i16 %a25, %b25 + %r26 = lshr i16 %a26, %b26 + %r27 = lshr i16 %a27, %b27 + %r28 = lshr i16 %a28, %b28 + %r29 = lshr i16 %a29, %b29 + %r30 = lshr i16 %a30, %b30 + %r31 = lshr i16 %a31, %b31 + store i16 %r0 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 0 ), align 2 + store i16 %r1 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 1 ), align 2 + store i16 %r2 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 2 ), align 2 + store i16 %r3 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 3 ), align 2 + store i16 %r4 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 4 ), align 2 + store i16 %r5 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 5 ), align 2 + store i16 %r6 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 6 ), align 2 + store i16 %r7 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 7 ), align 2 + store i16 %r8 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8 ), align 2 + store i16 %r9 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 9 ), align 2 + store i16 %r10, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 10), align 2 + store i16 %r11, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 11), align 2 + store i16 %r12, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 12), align 2 + store i16 %r13, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 13), align 2 + store i16 %r14, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 14), align 2 + store i16 %r15, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 15), align 2 + store i16 %r16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16), align 2 + store i16 %r17, i16* getelementptr inbounds ([32 x i16], 
[32 x i16]* @c16, i32 0, i64 17), align 2 + store i16 %r18, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 18), align 2 + store i16 %r19, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 19), align 2 + store i16 %r20, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 20), align 2 + store i16 %r21, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 21), align 2 + store i16 %r22, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 22), align 2 + store i16 %r23, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 23), align 2 + store i16 %r24, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24), align 2 + store i16 %r25, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 25), align 2 + store i16 %r26, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 26), align 2 + store i16 %r27, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 27), align 2 + store i16 %r28, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 28), align 2 + store i16 %r29, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2 + store i16 %r30, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2 + store i16 %r31, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2 + ret void +} + +define void @lshr_v64i8() { +; CHECK-LABEL: @lshr_v64i8( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 +; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 +; CHECK-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 +; CHECK-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 +; CHECK-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 +; CHECK-NEXT: [[TMP9:%.*]] = lshr <16 x i8> [[TMP1]], [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = lshr <16 x i8> [[TMP2]], [[TMP6]] +; CHECK-NEXT: [[TMP11:%.*]] = lshr <16 x i8> [[TMP3]], [[TMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = lshr <16 x i8> [[TMP4]], [[TMP8]] +; CHECK-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 +; CHECK-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 +; CHECK-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 +; CHECK-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1 +; CHECK-NEXT: ret void +; 
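+; The checks for this function use the common CHECK prefix: every run configuration is
+; expected to vectorize the 64 scalar i8 shifts below into the same four <16 x i8> lshr
+; operations shown in the autogenerated checks above.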
+ %a0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 0 ), align 1 + %a1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 1 ), align 1 + %a2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 2 ), align 1 + %a3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 3 ), align 1 + %a4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 4 ), align 1 + %a5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 5 ), align 1 + %a6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 6 ), align 1 + %a7 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 7 ), align 1 + %a8 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 8 ), align 1 + %a9 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 9 ), align 1 + %a10 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 10), align 1 + %a11 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 11), align 1 + %a12 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 12), align 1 + %a13 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 13), align 1 + %a14 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 14), align 1 + %a15 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 15), align 1 + %a16 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16), align 1 + %a17 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 17), align 1 + %a18 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 18), align 1 + %a19 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 19), align 1 + %a20 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 20), align 1 + %a21 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 21), align 1 + %a22 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 22), align 1 + %a23 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 23), align 1 + %a24 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 24), align 1 + %a25 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 25), align 1 + %a26 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 26), align 1 + %a27 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 27), align 1 + %a28 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 28), align 1 + %a29 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 29), align 1 + %a30 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 30), align 1 + %a31 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 31), align 1 + %a32 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32), align 1 + %a33 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 33), align 1 + %a34 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 34), align 1 + %a35 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 35), align 1 + %a36 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, 
i32 0, i64 36), align 1 + %a37 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 37), align 1 + %a38 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 38), align 1 + %a39 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 39), align 1 + %a40 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 40), align 1 + %a41 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 41), align 1 + %a42 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 42), align 1 + %a43 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 43), align 1 + %a44 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 44), align 1 + %a45 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 45), align 1 + %a46 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 46), align 1 + %a47 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 47), align 1 + %a48 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48), align 1 + %a49 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 49), align 1 + %a50 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 50), align 1 + %a51 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 51), align 1 + %a52 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 52), align 1 + %a53 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 53), align 1 + %a54 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 54), align 1 + %a55 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 55), align 1 + %a56 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 56), align 1 + %a57 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 57), align 1 + %a58 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 58), align 1 + %a59 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 59), align 1 + %a60 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 60), align 1 + %a61 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 61), align 1 + %a62 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 62), align 1 + %a63 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 63), align 1 + %b0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 0 ), align 1 + %b1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 1 ), align 1 + %b2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 2 ), align 1 + %b3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 3 ), align 1 + %b4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 4 ), align 1 + %b5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 5 ), align 1 + %b6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 6 ), align 1 + %b7 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 7 ), align 1 + %b8 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 8 ), align 1 + %b9 = load i8, i8* getelementptr inbounds 
([64 x i8], [64 x i8]* @b8, i32 0, i64 9 ), align 1 + %b10 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 10), align 1 + %b11 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 11), align 1 + %b12 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 12), align 1 + %b13 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 13), align 1 + %b14 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 14), align 1 + %b15 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 15), align 1 + %b16 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16), align 1 + %b17 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 17), align 1 + %b18 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 18), align 1 + %b19 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 19), align 1 + %b20 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 20), align 1 + %b21 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 21), align 1 + %b22 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 22), align 1 + %b23 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 23), align 1 + %b24 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 24), align 1 + %b25 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 25), align 1 + %b26 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 26), align 1 + %b27 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 27), align 1 + %b28 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 28), align 1 + %b29 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 29), align 1 + %b30 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 30), align 1 + %b31 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 31), align 1 + %b32 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32), align 1 + %b33 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 33), align 1 + %b34 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 34), align 1 + %b35 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 35), align 1 + %b36 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 36), align 1 + %b37 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 37), align 1 + %b38 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 38), align 1 + %b39 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 39), align 1 + %b40 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 40), align 1 + %b41 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 41), align 1 + %b42 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 42), align 1 + %b43 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 43), align 1 + %b44 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 44), align 1 + %b45 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 45), align 1 + %b46 = 
load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 46), align 1 + %b47 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 47), align 1 + %b48 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48), align 1 + %b49 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 49), align 1 + %b50 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 50), align 1 + %b51 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 51), align 1 + %b52 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 52), align 1 + %b53 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 53), align 1 + %b54 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 54), align 1 + %b55 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 55), align 1 + %b56 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 56), align 1 + %b57 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 57), align 1 + %b58 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 58), align 1 + %b59 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 59), align 1 + %b60 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 60), align 1 + %b61 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 61), align 1 + %b62 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 62), align 1 + %b63 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 63), align 1 + %r0 = lshr i8 %a0 , %b0 + %r1 = lshr i8 %a1 , %b1 + %r2 = lshr i8 %a2 , %b2 + %r3 = lshr i8 %a3 , %b3 + %r4 = lshr i8 %a4 , %b4 + %r5 = lshr i8 %a5 , %b5 + %r6 = lshr i8 %a6 , %b6 + %r7 = lshr i8 %a7 , %b7 + %r8 = lshr i8 %a8 , %b8 + %r9 = lshr i8 %a9 , %b9 + %r10 = lshr i8 %a10, %b10 + %r11 = lshr i8 %a11, %b11 + %r12 = lshr i8 %a12, %b12 + %r13 = lshr i8 %a13, %b13 + %r14 = lshr i8 %a14, %b14 + %r15 = lshr i8 %a15, %b15 + %r16 = lshr i8 %a16, %b16 + %r17 = lshr i8 %a17, %b17 + %r18 = lshr i8 %a18, %b18 + %r19 = lshr i8 %a19, %b19 + %r20 = lshr i8 %a20, %b20 + %r21 = lshr i8 %a21, %b21 + %r22 = lshr i8 %a22, %b22 + %r23 = lshr i8 %a23, %b23 + %r24 = lshr i8 %a24, %b24 + %r25 = lshr i8 %a25, %b25 + %r26 = lshr i8 %a26, %b26 + %r27 = lshr i8 %a27, %b27 + %r28 = lshr i8 %a28, %b28 + %r29 = lshr i8 %a29, %b29 + %r30 = lshr i8 %a30, %b30 + %r31 = lshr i8 %a31, %b31 + %r32 = lshr i8 %a32, %b32 + %r33 = lshr i8 %a33, %b33 + %r34 = lshr i8 %a34, %b34 + %r35 = lshr i8 %a35, %b35 + %r36 = lshr i8 %a36, %b36 + %r37 = lshr i8 %a37, %b37 + %r38 = lshr i8 %a38, %b38 + %r39 = lshr i8 %a39, %b39 + %r40 = lshr i8 %a40, %b40 + %r41 = lshr i8 %a41, %b41 + %r42 = lshr i8 %a42, %b42 + %r43 = lshr i8 %a43, %b43 + %r44 = lshr i8 %a44, %b44 + %r45 = lshr i8 %a45, %b45 + %r46 = lshr i8 %a46, %b46 + %r47 = lshr i8 %a47, %b47 + %r48 = lshr i8 %a48, %b48 + %r49 = lshr i8 %a49, %b49 + %r50 = lshr i8 %a50, %b50 + %r51 = lshr i8 %a51, %b51 + %r52 = lshr i8 %a52, %b52 + %r53 = lshr i8 %a53, %b53 + %r54 = lshr i8 %a54, %b54 + %r55 = lshr i8 %a55, %b55 + %r56 = lshr i8 %a56, %b56 + %r57 = lshr i8 %a57, %b57 + %r58 = lshr i8 %a58, %b58 + %r59 = lshr i8 %a59, %b59 + %r60 = lshr i8 %a60, %b60 + %r61 = lshr i8 %a61, %b61 + %r62 = lshr i8 %a62, %b62 + %r63 = lshr i8 %a63, %b63 + store i8 %r0 , i8* getelementptr inbounds 
([64 x i8], [64 x i8]* @c8, i32 0, i64 0 ), align 1 + store i8 %r1 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 1 ), align 1 + store i8 %r2 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 2 ), align 1 + store i8 %r3 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 3 ), align 1 + store i8 %r4 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 4 ), align 1 + store i8 %r5 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 5 ), align 1 + store i8 %r6 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 6 ), align 1 + store i8 %r7 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 7 ), align 1 + store i8 %r8 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 8 ), align 1 + store i8 %r9 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 9 ), align 1 + store i8 %r10, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 10), align 1 + store i8 %r11, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 11), align 1 + store i8 %r12, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 12), align 1 + store i8 %r13, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 13), align 1 + store i8 %r14, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 14), align 1 + store i8 %r15, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 15), align 1 + store i8 %r16, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16), align 1 + store i8 %r17, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 17), align 1 + store i8 %r18, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 18), align 1 + store i8 %r19, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 19), align 1 + store i8 %r20, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 20), align 1 + store i8 %r21, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 21), align 1 + store i8 %r22, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 22), align 1 + store i8 %r23, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 23), align 1 + store i8 %r24, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 24), align 1 + store i8 %r25, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 25), align 1 + store i8 %r26, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 26), align 1 + store i8 %r27, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 27), align 1 + store i8 %r28, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 28), align 1 + store i8 %r29, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 29), align 1 + store i8 %r30, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 30), align 1 + store i8 %r31, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 31), align 1 + store i8 %r32, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32), align 1 + store i8 %r33, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 33), align 1 + store i8 %r34, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 34), align 1 + store i8 %r35, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 35), align 1 + store i8 %r36, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 36), align 1 + store i8 %r37, i8* getelementptr inbounds 
([64 x i8], [64 x i8]* @c8, i32 0, i64 37), align 1
+ store i8 %r38, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 38), align 1
+ store i8 %r39, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 39), align 1
+ store i8 %r40, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 40), align 1
+ store i8 %r41, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 41), align 1
+ store i8 %r42, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 42), align 1
+ store i8 %r43, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 43), align 1
+ store i8 %r44, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 44), align 1
+ store i8 %r45, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 45), align 1
+ store i8 %r46, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 46), align 1
+ store i8 %r47, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 47), align 1
+ store i8 %r48, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48), align 1
+ store i8 %r49, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 49), align 1
+ store i8 %r50, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 50), align 1
+ store i8 %r51, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 51), align 1
+ store i8 %r52, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 52), align 1
+ store i8 %r53, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 53), align 1
+ store i8 %r54, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 54), align 1
+ store i8 %r55, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 55), align 1
+ store i8 %r56, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 56), align 1
+ store i8 %r57, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 57), align 1
+ store i8 %r58, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 58), align 1
+ store i8 %r59, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 59), align 1
+ store i8 %r60, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 60), align 1
+ store i8 %r61, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 61), align 1
+ store i8 %r62, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 62), align 1
+ store i8 %r63, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 63), align 1
+ ret void
+}
diff --git a/test/Transforms/SLPVectorizer/X86/shift-shl.ll b/test/Transforms/SLPVectorizer/X86/shift-shl.ll
new file mode 100644
index 000000000000..70de82bdea5f
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/shift-shl.ll
@@ -0,0 +1,814 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver4 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=XOP
+
+@a64 = common global [8 x i64] zeroinitializer, align 64
+@b64 = common global [8 x i64] zeroinitializer, align 64
+@c64 = common global [8 x i64] zeroinitializer, align 64
+@a32 = common global [16 x i32] zeroinitializer, align 64
+@b32 = common global [16 x i32] zeroinitializer, align 64
+@c32 = common global [16 x i32] zeroinitializer, align 64
+@a16 = common global [32 x i16] zeroinitializer, align 64
+@b16 = common global [32 x i16] zeroinitializer, align 64
+@c16 = common global [32 x i16] zeroinitializer, align 64
+@a8 = common global [64 x i8] zeroinitializer, align 64
+@b8 = common global [64 x i8] zeroinitializer, align 64
+@c8 = common global [64 x i8] zeroinitializer, align 64
+
+define void @shl_v8i64() {
+; SSE-LABEL: @shl_v8i64(
+; SSE-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP9:%.*]] = shl <2 x i64> [[TMP1]], [[TMP5]]
+; SSE-NEXT: [[TMP10:%.*]] = shl <2 x i64> [[TMP2]], [[TMP6]]
+; SSE-NEXT: [[TMP11:%.*]] = shl <2 x i64> [[TMP3]], [[TMP7]]
+; SSE-NEXT: [[TMP12:%.*]] = shl <2 x i64> [[TMP4]], [[TMP8]]
+; SSE-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
+; SSE-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
+; SSE-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
+; SSE-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8
+; SSE-NEXT: ret void
+;
+; AVX1-LABEL: @shl_v8i64(
+; AVX1-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8
+; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
+; AVX1-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
+; AVX1-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x
i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 +; AVX1-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 +; AVX1-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 +; AVX1-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 +; AVX1-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 +; AVX1-NEXT: [[TMP9:%.*]] = shl <2 x i64> [[TMP1]], [[TMP5]] +; AVX1-NEXT: [[TMP10:%.*]] = shl <2 x i64> [[TMP2]], [[TMP6]] +; AVX1-NEXT: [[TMP11:%.*]] = shl <2 x i64> [[TMP3]], [[TMP7]] +; AVX1-NEXT: [[TMP12:%.*]] = shl <2 x i64> [[TMP4]], [[TMP8]] +; AVX1-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 +; AVX1-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 +; AVX1-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; AVX1-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 +; AVX1-NEXT: ret void +; +; AVX2-LABEL: @shl_v8i64( +; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 +; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX2-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 +; AVX2-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX2-NEXT: [[TMP5:%.*]] = shl <4 x i64> [[TMP1]], [[TMP3]] +; AVX2-NEXT: [[TMP6:%.*]] = shl <4 x i64> [[TMP2]], [[TMP4]] +; AVX2-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX2-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX2-NEXT: ret void +; +; AVX512-LABEL: @shl_v8i64( +; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8 +; AVX512-NEXT: [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8 +; AVX512-NEXT: [[TMP3:%.*]] = shl <8 x i64> [[TMP1]], [[TMP2]] +; AVX512-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8 +; AVX512-NEXT: ret void +; +; XOP-LABEL: @shl_v8i64( +; XOP-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 +; XOP-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 +; XOP-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 +; XOP-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 +; XOP-NEXT: [[TMP5:%.*]] = shl <4 x i64> [[TMP1]], [[TMP3]] +; XOP-NEXT: [[TMP6:%.*]] = shl <4 x i64> [[TMP2]], [[TMP4]] 
+; XOP-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; XOP-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 +; XOP-NEXT: ret void +; + %a0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8 + %a1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8 + %a2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8 + %a3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8 + %a4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8 + %a5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8 + %a6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8 + %a7 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8 + %b0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8 + %b1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8 + %b2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8 + %b3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8 + %b4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8 + %b5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8 + %b6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8 + %b7 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8 + %r0 = shl i64 %a0, %b0 + %r1 = shl i64 %a1, %b1 + %r2 = shl i64 %a2, %b2 + %r3 = shl i64 %a3, %b3 + %r4 = shl i64 %a4, %b4 + %r5 = shl i64 %a5, %b5 + %r6 = shl i64 %a6, %b6 + %r7 = shl i64 %a7, %b7 + store i64 %r0, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8 + store i64 %r1, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8 + store i64 %r2, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8 + store i64 %r3, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8 + store i64 %r4, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8 + store i64 %r5, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8 + store i64 %r6, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8 + store i64 %r7, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8 + ret void +} + +define void @shl_v16i32() { +; SSE-LABEL: @shl_v16i32( +; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x 
i32]* @b32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP9:%.*]] = shl <4 x i32> [[TMP1]], [[TMP5]] +; SSE-NEXT: [[TMP10:%.*]] = shl <4 x i32> [[TMP2]], [[TMP6]] +; SSE-NEXT: [[TMP11:%.*]] = shl <4 x i32> [[TMP3]], [[TMP7]] +; SSE-NEXT: [[TMP12:%.*]] = shl <4 x i32> [[TMP4]], [[TMP8]] +; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SSE-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: ret void +; +; AVX-LABEL: @shl_v16i32( +; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP5:%.*]] = shl <8 x i32> [[TMP1]], [[TMP3]] +; AVX-NEXT: [[TMP6:%.*]] = shl <8 x i32> [[TMP2]], [[TMP4]] +; AVX-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: ret void +; +; AVX512-LABEL: @shl_v16i32( +; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @a32 to <16 x i32>*), align 4 +; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @b32 to <16 x i32>*), align 4 +; AVX512-NEXT: [[TMP3:%.*]] = shl <16 x i32> [[TMP1]], [[TMP2]] +; AVX512-NEXT: store <16 x i32> [[TMP3]], <16 x i32>* bitcast ([16 x i32]* @c32 to <16 x i32>*), align 4 +; AVX512-NEXT: ret void +; +; XOP-LABEL: @shl_v16i32( +; XOP-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 +; XOP-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 +; XOP-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 +; XOP-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 +; XOP-NEXT: [[TMP5:%.*]] = shl <8 x i32> [[TMP1]], [[TMP3]] +; XOP-NEXT: [[TMP6:%.*]] = shl <8 x i32> [[TMP2]], [[TMP4]] +; XOP-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 
4 +; XOP-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4 +; XOP-NEXT: ret void +; + %a0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0 ), align 4 + %a1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1 ), align 4 + %a2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2 ), align 4 + %a3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3 ), align 4 + %a4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4 ), align 4 + %a5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5 ), align 4 + %a6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6 ), align 4 + %a7 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7 ), align 4 + %a8 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8 ), align 4 + %a9 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9 ), align 4 + %a10 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4 + %a11 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4 + %a12 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4 + %a13 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4 + %a14 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4 + %a15 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4 + %b0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0 ), align 4 + %b1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 1 ), align 4 + %b2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2 ), align 4 + %b3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 3 ), align 4 + %b4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4 ), align 4 + %b5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5 ), align 4 + %b6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6 ), align 4 + %b7 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7 ), align 4 + %b8 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8 ), align 4 + %b9 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9 ), align 4 + %b10 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4 + %b11 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4 + %b12 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4 + %b13 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 13), align 4 + %b14 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4 + %b15 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4 + %r0 = shl i32 %a0 , %b0 + %r1 = shl i32 %a1 , %b1 + %r2 = shl i32 %a2 , %b2 + %r3 = shl i32 %a3 , %b3 + %r4 = shl i32 %a4 , 
%b4 + %r5 = shl i32 %a5 , %b5 + %r6 = shl i32 %a6 , %b6 + %r7 = shl i32 %a7 , %b7 + %r8 = shl i32 %a8 , %b8 + %r9 = shl i32 %a9 , %b9 + %r10 = shl i32 %a10, %b10 + %r11 = shl i32 %a11, %b11 + %r12 = shl i32 %a12, %b12 + %r13 = shl i32 %a13, %b13 + %r14 = shl i32 %a14, %b14 + %r15 = shl i32 %a15, %b15 + store i32 %r0 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0 ), align 4 + store i32 %r1 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1 ), align 4 + store i32 %r2 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2 ), align 4 + store i32 %r3 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3 ), align 4 + store i32 %r4 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4 ), align 4 + store i32 %r5 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5 ), align 4 + store i32 %r6 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6 ), align 4 + store i32 %r7 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7 ), align 4 + store i32 %r8 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8 ), align 4 + store i32 %r9 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9 ), align 4 + store i32 %r10, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4 + store i32 %r11, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4 + store i32 %r12, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4 + store i32 %r13, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4 + store i32 %r14, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4 + store i32 %r15, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4 + ret void +} + +define void @shl_v32i16() { +; SSE-LABEL: @shl_v32i16( +; SSE-NEXT: [[A0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0), align 2 +; SSE-NEXT: [[A1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1), align 2 +; SSE-NEXT: [[A2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2), align 2 +; SSE-NEXT: [[A3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3), align 2 +; SSE-NEXT: [[A4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4), align 2 +; SSE-NEXT: [[A5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5), align 2 +; SSE-NEXT: [[A6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6), align 2 +; SSE-NEXT: [[A7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 7), align 2 +; SSE-NEXT: [[A8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8), align 2 +; SSE-NEXT: [[A9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 9), align 2 +; SSE-NEXT: [[A10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 10), align 2 +; SSE-NEXT: [[A11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 11), align 2 +; SSE-NEXT: [[A12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 12), align 2 +; SSE-NEXT: [[A13:%.*]] 
= load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 13), align 2 +; SSE-NEXT: [[A14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 14), align 2 +; SSE-NEXT: [[A15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 15), align 2 +; SSE-NEXT: [[A16:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16), align 2 +; SSE-NEXT: [[A17:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 17), align 2 +; SSE-NEXT: [[A18:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 18), align 2 +; SSE-NEXT: [[A19:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 19), align 2 +; SSE-NEXT: [[A20:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 20), align 2 +; SSE-NEXT: [[A21:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 21), align 2 +; SSE-NEXT: [[A22:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 22), align 2 +; SSE-NEXT: [[A23:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 23), align 2 +; SSE-NEXT: [[A24:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24), align 2 +; SSE-NEXT: [[A25:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 25), align 2 +; SSE-NEXT: [[A26:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 26), align 2 +; SSE-NEXT: [[A27:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 27), align 2 +; SSE-NEXT: [[A28:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 28), align 2 +; SSE-NEXT: [[A29:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 29), align 2 +; SSE-NEXT: [[A30:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 30), align 2 +; SSE-NEXT: [[A31:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 31), align 2 +; SSE-NEXT: [[B0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 0), align 2 +; SSE-NEXT: [[B1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 1), align 2 +; SSE-NEXT: [[B2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 2), align 2 +; SSE-NEXT: [[B3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 3), align 2 +; SSE-NEXT: [[B4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 4), align 2 +; SSE-NEXT: [[B5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 5), align 2 +; SSE-NEXT: [[B6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 6), align 2 +; SSE-NEXT: [[B7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 7), align 2 +; SSE-NEXT: [[B8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8), align 2 +; SSE-NEXT: [[B9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 9), align 2 +; SSE-NEXT: [[B10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 10), align 2 
+; SSE-NEXT: [[B11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 11), align 2 +; SSE-NEXT: [[B12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 12), align 2 +; SSE-NEXT: [[B13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 13), align 2 +; SSE-NEXT: [[B14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 14), align 2 +; SSE-NEXT: [[B15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 15), align 2 +; SSE-NEXT: [[B16:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16), align 2 +; SSE-NEXT: [[B17:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 17), align 2 +; SSE-NEXT: [[B18:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 18), align 2 +; SSE-NEXT: [[B19:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 19), align 2 +; SSE-NEXT: [[B20:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 20), align 2 +; SSE-NEXT: [[B21:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 21), align 2 +; SSE-NEXT: [[B22:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 22), align 2 +; SSE-NEXT: [[B23:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 23), align 2 +; SSE-NEXT: [[B24:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24), align 2 +; SSE-NEXT: [[B25:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 25), align 2 +; SSE-NEXT: [[B26:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 26), align 2 +; SSE-NEXT: [[B27:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 27), align 2 +; SSE-NEXT: [[B28:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 28), align 2 +; SSE-NEXT: [[B29:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 29), align 2 +; SSE-NEXT: [[B30:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 30), align 2 +; SSE-NEXT: [[B31:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 31), align 2 +; SSE-NEXT: [[R0:%.*]] = shl i16 [[A0]], [[B0]] +; SSE-NEXT: [[R1:%.*]] = shl i16 [[A1]], [[B1]] +; SSE-NEXT: [[R2:%.*]] = shl i16 [[A2]], [[B2]] +; SSE-NEXT: [[R3:%.*]] = shl i16 [[A3]], [[B3]] +; SSE-NEXT: [[R4:%.*]] = shl i16 [[A4]], [[B4]] +; SSE-NEXT: [[R5:%.*]] = shl i16 [[A5]], [[B5]] +; SSE-NEXT: [[R6:%.*]] = shl i16 [[A6]], [[B6]] +; SSE-NEXT: [[R7:%.*]] = shl i16 [[A7]], [[B7]] +; SSE-NEXT: [[R8:%.*]] = shl i16 [[A8]], [[B8]] +; SSE-NEXT: [[R9:%.*]] = shl i16 [[A9]], [[B9]] +; SSE-NEXT: [[R10:%.*]] = shl i16 [[A10]], [[B10]] +; SSE-NEXT: [[R11:%.*]] = shl i16 [[A11]], [[B11]] +; SSE-NEXT: [[R12:%.*]] = shl i16 [[A12]], [[B12]] +; SSE-NEXT: [[R13:%.*]] = shl i16 [[A13]], [[B13]] +; SSE-NEXT: [[R14:%.*]] = shl i16 [[A14]], [[B14]] +; SSE-NEXT: [[R15:%.*]] = shl i16 [[A15]], [[B15]] +; SSE-NEXT: [[R16:%.*]] = shl i16 [[A16]], [[B16]] +; SSE-NEXT: [[R17:%.*]] = shl i16 [[A17]], [[B17]] +; SSE-NEXT: [[R18:%.*]] = shl i16 [[A18]], [[B18]] +; SSE-NEXT: [[R19:%.*]] = shl i16 [[A19]], [[B19]] +; SSE-NEXT: [[R20:%.*]] 
= shl i16 [[A20]], [[B20]] +; SSE-NEXT: [[R21:%.*]] = shl i16 [[A21]], [[B21]] +; SSE-NEXT: [[R22:%.*]] = shl i16 [[A22]], [[B22]] +; SSE-NEXT: [[R23:%.*]] = shl i16 [[A23]], [[B23]] +; SSE-NEXT: [[R24:%.*]] = shl i16 [[A24]], [[B24]] +; SSE-NEXT: [[R25:%.*]] = shl i16 [[A25]], [[B25]] +; SSE-NEXT: [[R26:%.*]] = shl i16 [[A26]], [[B26]] +; SSE-NEXT: [[R27:%.*]] = shl i16 [[A27]], [[B27]] +; SSE-NEXT: [[R28:%.*]] = shl i16 [[A28]], [[B28]] +; SSE-NEXT: [[R29:%.*]] = shl i16 [[A29]], [[B29]] +; SSE-NEXT: [[R30:%.*]] = shl i16 [[A30]], [[B30]] +; SSE-NEXT: [[R31:%.*]] = shl i16 [[A31]], [[B31]] +; SSE-NEXT: store i16 [[R0]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 0), align 2 +; SSE-NEXT: store i16 [[R1]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 1), align 2 +; SSE-NEXT: store i16 [[R2]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 2), align 2 +; SSE-NEXT: store i16 [[R3]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 3), align 2 +; SSE-NEXT: store i16 [[R4]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 4), align 2 +; SSE-NEXT: store i16 [[R5]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 5), align 2 +; SSE-NEXT: store i16 [[R6]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 6), align 2 +; SSE-NEXT: store i16 [[R7]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 7), align 2 +; SSE-NEXT: store i16 [[R8]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8), align 2 +; SSE-NEXT: store i16 [[R9]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 9), align 2 +; SSE-NEXT: store i16 [[R10]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 10), align 2 +; SSE-NEXT: store i16 [[R11]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 11), align 2 +; SSE-NEXT: store i16 [[R12]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 12), align 2 +; SSE-NEXT: store i16 [[R13]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 13), align 2 +; SSE-NEXT: store i16 [[R14]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 14), align 2 +; SSE-NEXT: store i16 [[R15]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 15), align 2 +; SSE-NEXT: store i16 [[R16]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16), align 2 +; SSE-NEXT: store i16 [[R17]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 17), align 2 +; SSE-NEXT: store i16 [[R18]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 18), align 2 +; SSE-NEXT: store i16 [[R19]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 19), align 2 +; SSE-NEXT: store i16 [[R20]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 20), align 2 +; SSE-NEXT: store i16 [[R21]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 21), align 2 +; SSE-NEXT: store i16 [[R22]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 22), align 2 +; SSE-NEXT: store i16 [[R23]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 23), align 2 +; SSE-NEXT: store i16 [[R24]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24), align 2 +; SSE-NEXT: store i16 [[R25]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, 
i64 25), align 2 +; SSE-NEXT: store i16 [[R26]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 26), align 2 +; SSE-NEXT: store i16 [[R27]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 27), align 2 +; SSE-NEXT: store i16 [[R28]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 28), align 2 +; SSE-NEXT: store i16 [[R29]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2 +; SSE-NEXT: store i16 [[R30]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2 +; SSE-NEXT: store i16 [[R31]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2 +; SSE-NEXT: ret void +; +; AVX-LABEL: @shl_v32i16( +; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP5:%.*]] = shl <16 x i16> [[TMP1]], [[TMP3]] +; AVX-NEXT: [[TMP6:%.*]] = shl <16 x i16> [[TMP2]], [[TMP4]] +; AVX-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: ret void +; +; AVX512-LABEL: @shl_v32i16( +; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2 +; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX512-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 +; AVX512-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX512-NEXT: [[TMP5:%.*]] = shl <16 x i16> [[TMP1]], [[TMP3]] +; AVX512-NEXT: [[TMP6:%.*]] = shl <16 x i16> [[TMP2]], [[TMP4]] +; AVX512-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX512-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX512-NEXT: ret void +; +; XOP-LABEL: @shl_v32i16( +; XOP-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2 +; XOP-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 +; XOP-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 +; XOP-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 +; XOP-NEXT: [[TMP5:%.*]] = shl <16 x i16> [[TMP1]], [[TMP3]] +; XOP-NEXT: [[TMP6:%.*]] = shl <16 x i16> [[TMP2]], [[TMP4]] +; XOP-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; XOP-NEXT: store <16 
x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2 +; XOP-NEXT: ret void +; + %a0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0 ), align 2 + %a1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1 ), align 2 + %a2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2 ), align 2 + %a3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3 ), align 2 + %a4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4 ), align 2 + %a5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5 ), align 2 + %a6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6 ), align 2 + %a7 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 7 ), align 2 + %a8 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8 ), align 2 + %a9 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 9 ), align 2 + %a10 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 10), align 2 + %a11 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 11), align 2 + %a12 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 12), align 2 + %a13 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 13), align 2 + %a14 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 14), align 2 + %a15 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 15), align 2 + %a16 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16), align 2 + %a17 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 17), align 2 + %a18 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 18), align 2 + %a19 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 19), align 2 + %a20 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 20), align 2 + %a21 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 21), align 2 + %a22 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 22), align 2 + %a23 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 23), align 2 + %a24 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24), align 2 + %a25 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 25), align 2 + %a26 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 26), align 2 + %a27 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 27), align 2 + %a28 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 28), align 2 + %a29 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 29), align 2 + %a30 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 30), align 2 + %a31 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 31), align 2 + %b0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 0 ), align 2 + %b1 = load i16, i16* getelementptr 
inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 1 ), align 2 + %b2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 2 ), align 2 + %b3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 3 ), align 2 + %b4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 4 ), align 2 + %b5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 5 ), align 2 + %b6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 6 ), align 2 + %b7 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 7 ), align 2 + %b8 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8 ), align 2 + %b9 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 9 ), align 2 + %b10 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 10), align 2 + %b11 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 11), align 2 + %b12 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 12), align 2 + %b13 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 13), align 2 + %b14 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 14), align 2 + %b15 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 15), align 2 + %b16 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16), align 2 + %b17 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 17), align 2 + %b18 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 18), align 2 + %b19 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 19), align 2 + %b20 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 20), align 2 + %b21 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 21), align 2 + %b22 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 22), align 2 + %b23 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 23), align 2 + %b24 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24), align 2 + %b25 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 25), align 2 + %b26 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 26), align 2 + %b27 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 27), align 2 + %b28 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 28), align 2 + %b29 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 29), align 2 + %b30 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 30), align 2 + %b31 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 31), align 2 + %r0 = shl i16 %a0 , %b0 + %r1 = shl i16 %a1 , %b1 + %r2 = shl i16 %a2 , %b2 + %r3 = shl i16 %a3 , %b3 + %r4 = shl i16 %a4 , %b4 + %r5 = shl i16 %a5 , %b5 + %r6 = shl i16 %a6 , %b6 + %r7 = shl i16 %a7 , %b7 + %r8 = shl i16 %a8 , %b8 + %r9 = shl i16 %a9 , %b9 + %r10 = shl i16 %a10, %b10 + %r11 = shl i16 %a11, %b11 + %r12 = shl i16 %a12, %b12 + %r13 = shl i16 %a13, %b13 + %r14 = shl i16 %a14, %b14 + %r15 = shl i16 %a15, %b15 + %r16 = 
shl i16 %a16, %b16 + %r17 = shl i16 %a17, %b17 + %r18 = shl i16 %a18, %b18 + %r19 = shl i16 %a19, %b19 + %r20 = shl i16 %a20, %b20 + %r21 = shl i16 %a21, %b21 + %r22 = shl i16 %a22, %b22 + %r23 = shl i16 %a23, %b23 + %r24 = shl i16 %a24, %b24 + %r25 = shl i16 %a25, %b25 + %r26 = shl i16 %a26, %b26 + %r27 = shl i16 %a27, %b27 + %r28 = shl i16 %a28, %b28 + %r29 = shl i16 %a29, %b29 + %r30 = shl i16 %a30, %b30 + %r31 = shl i16 %a31, %b31 + store i16 %r0 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 0 ), align 2 + store i16 %r1 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 1 ), align 2 + store i16 %r2 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 2 ), align 2 + store i16 %r3 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 3 ), align 2 + store i16 %r4 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 4 ), align 2 + store i16 %r5 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 5 ), align 2 + store i16 %r6 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 6 ), align 2 + store i16 %r7 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 7 ), align 2 + store i16 %r8 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8 ), align 2 + store i16 %r9 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 9 ), align 2 + store i16 %r10, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 10), align 2 + store i16 %r11, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 11), align 2 + store i16 %r12, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 12), align 2 + store i16 %r13, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 13), align 2 + store i16 %r14, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 14), align 2 + store i16 %r15, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 15), align 2 + store i16 %r16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16), align 2 + store i16 %r17, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 17), align 2 + store i16 %r18, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 18), align 2 + store i16 %r19, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 19), align 2 + store i16 %r20, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 20), align 2 + store i16 %r21, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 21), align 2 + store i16 %r22, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 22), align 2 + store i16 %r23, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 23), align 2 + store i16 %r24, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24), align 2 + store i16 %r25, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 25), align 2 + store i16 %r26, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 26), align 2 + store i16 %r27, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 27), align 2 + store i16 %r28, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 28), align 2 + store i16 %r29, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2 + store i16 %r30, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, 
i64 30), align 2 + store i16 %r31, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2 + ret void +} + +define void @shl_v64i8() { +; CHECK-LABEL: @shl_v64i8( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 +; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 +; CHECK-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 +; CHECK-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 +; CHECK-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 +; CHECK-NEXT: [[TMP9:%.*]] = shl <16 x i8> [[TMP1]], [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = shl <16 x i8> [[TMP2]], [[TMP6]] +; CHECK-NEXT: [[TMP11:%.*]] = shl <16 x i8> [[TMP3]], [[TMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = shl <16 x i8> [[TMP4]], [[TMP8]] +; CHECK-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 +; CHECK-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 +; CHECK-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 +; CHECK-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1 +; CHECK-NEXT: ret void +; + %a0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 0 ), align 1 + %a1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 1 ), align 1 + %a2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 2 ), align 1 + %a3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 3 ), align 1 + %a4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 4 ), align 1 + %a5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 5 ), align 1 + %a6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 6 ), align 1 + %a7 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 7 ), align 1 + %a8 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 8 ), align 1 + %a9 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 9 ), align 1 + %a10 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 10), align 1 + %a11 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 11), align 1 + %a12 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 12), align 1 + %a13 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 13), 
align 1 + %a14 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 14), align 1 + %a15 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 15), align 1 + %a16 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16), align 1 + %a17 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 17), align 1 + %a18 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 18), align 1 + %a19 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 19), align 1 + %a20 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 20), align 1 + %a21 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 21), align 1 + %a22 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 22), align 1 + %a23 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 23), align 1 + %a24 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 24), align 1 + %a25 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 25), align 1 + %a26 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 26), align 1 + %a27 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 27), align 1 + %a28 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 28), align 1 + %a29 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 29), align 1 + %a30 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 30), align 1 + %a31 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 31), align 1 + %a32 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32), align 1 + %a33 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 33), align 1 + %a34 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 34), align 1 + %a35 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 35), align 1 + %a36 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 36), align 1 + %a37 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 37), align 1 + %a38 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 38), align 1 + %a39 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 39), align 1 + %a40 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 40), align 1 + %a41 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 41), align 1 + %a42 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 42), align 1 + %a43 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 43), align 1 + %a44 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 44), align 1 + %a45 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 45), align 1 + %a46 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 46), align 1 + %a47 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 47), align 1 + %a48 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48), align 1 + %a49 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 49), align 1 + %a50 = load i8, i8* getelementptr inbounds ([64 x 
i8], [64 x i8]* @a8, i32 0, i64 50), align 1 + %a51 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 51), align 1 + %a52 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 52), align 1 + %a53 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 53), align 1 + %a54 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 54), align 1 + %a55 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 55), align 1 + %a56 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 56), align 1 + %a57 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 57), align 1 + %a58 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 58), align 1 + %a59 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 59), align 1 + %a60 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 60), align 1 + %a61 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 61), align 1 + %a62 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 62), align 1 + %a63 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 63), align 1 + %b0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 0 ), align 1 + %b1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 1 ), align 1 + %b2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 2 ), align 1 + %b3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 3 ), align 1 + %b4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 4 ), align 1 + %b5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 5 ), align 1 + %b6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 6 ), align 1 + %b7 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 7 ), align 1 + %b8 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 8 ), align 1 + %b9 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 9 ), align 1 + %b10 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 10), align 1 + %b11 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 11), align 1 + %b12 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 12), align 1 + %b13 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 13), align 1 + %b14 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 14), align 1 + %b15 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 15), align 1 + %b16 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16), align 1 + %b17 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 17), align 1 + %b18 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 18), align 1 + %b19 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 19), align 1 + %b20 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 20), align 1 + %b21 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 21), align 1 + %b22 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 22), align 1 + %b23 = load i8, i8* 
getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 23), align 1
+ %b24 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 24), align 1
+ %b25 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 25), align 1
+ %b26 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 26), align 1
+ %b27 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 27), align 1
+ %b28 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 28), align 1
+ %b29 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 29), align 1
+ %b30 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 30), align 1
+ %b31 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 31), align 1
+ %b32 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32), align 1
+ %b33 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 33), align 1
+ %b34 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 34), align 1
+ %b35 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 35), align 1
+ %b36 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 36), align 1
+ %b37 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 37), align 1
+ %b38 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 38), align 1
+ %b39 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 39), align 1
+ %b40 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 40), align 1
+ %b41 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 41), align 1
+ %b42 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 42), align 1
+ %b43 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 43), align 1
+ %b44 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 44), align 1
+ %b45 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 45), align 1
+ %b46 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 46), align 1
+ %b47 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 47), align 1
+ %b48 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48), align 1
+ %b49 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 49), align 1
+ %b50 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 50), align 1
+ %b51 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 51), align 1
+ %b52 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 52), align 1
+ %b53 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 53), align 1
+ %b54 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 54), align 1
+ %b55 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 55), align 1
+ %b56 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 56), align 1
+ %b57 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 57), align 1
+ %b58 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 58), align 1
+ %b59 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 59), align 1
+ %b60 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 60), align 1
+ %b61 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 61), align 1
+ %b62 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 62), align 1
+ %b63 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 63), align 1
+ %r0 = shl i8 %a0 , %b0
+ %r1 = shl i8 %a1 , %b1
+ %r2 = shl i8 %a2 , %b2
+ %r3 = shl i8 %a3 , %b3
+ %r4 = shl i8 %a4 , %b4
+ %r5 = shl i8 %a5 , %b5
+ %r6 = shl i8 %a6 , %b6
+ %r7 = shl i8 %a7 , %b7
+ %r8 = shl i8 %a8 , %b8
+ %r9 = shl i8 %a9 , %b9
+ %r10 = shl i8 %a10, %b10
+ %r11 = shl i8 %a11, %b11
+ %r12 = shl i8 %a12, %b12
+ %r13 = shl i8 %a13, %b13
+ %r14 = shl i8 %a14, %b14
+ %r15 = shl i8 %a15, %b15
+ %r16 = shl i8 %a16, %b16
+ %r17 = shl i8 %a17, %b17
+ %r18 = shl i8 %a18, %b18
+ %r19 = shl i8 %a19, %b19
+ %r20 = shl i8 %a20, %b20
+ %r21 = shl i8 %a21, %b21
+ %r22 = shl i8 %a22, %b22
+ %r23 = shl i8 %a23, %b23
+ %r24 = shl i8 %a24, %b24
+ %r25 = shl i8 %a25, %b25
+ %r26 = shl i8 %a26, %b26
+ %r27 = shl i8 %a27, %b27
+ %r28 = shl i8 %a28, %b28
+ %r29 = shl i8 %a29, %b29
+ %r30 = shl i8 %a30, %b30
+ %r31 = shl i8 %a31, %b31
+ %r32 = shl i8 %a32, %b32
+ %r33 = shl i8 %a33, %b33
+ %r34 = shl i8 %a34, %b34
+ %r35 = shl i8 %a35, %b35
+ %r36 = shl i8 %a36, %b36
+ %r37 = shl i8 %a37, %b37
+ %r38 = shl i8 %a38, %b38
+ %r39 = shl i8 %a39, %b39
+ %r40 = shl i8 %a40, %b40
+ %r41 = shl i8 %a41, %b41
+ %r42 = shl i8 %a42, %b42
+ %r43 = shl i8 %a43, %b43
+ %r44 = shl i8 %a44, %b44
+ %r45 = shl i8 %a45, %b45
+ %r46 = shl i8 %a46, %b46
+ %r47 = shl i8 %a47, %b47
+ %r48 = shl i8 %a48, %b48
+ %r49 = shl i8 %a49, %b49
+ %r50 = shl i8 %a50, %b50
+ %r51 = shl i8 %a51, %b51
+ %r52 = shl i8 %a52, %b52
+ %r53 = shl i8 %a53, %b53
+ %r54 = shl i8 %a54, %b54
+ %r55 = shl i8 %a55, %b55
+ %r56 = shl i8 %a56, %b56
+ %r57 = shl i8 %a57, %b57
+ %r58 = shl i8 %a58, %b58
+ %r59 = shl i8 %a59, %b59
+ %r60 = shl i8 %a60, %b60
+ %r61 = shl i8 %a61, %b61
+ %r62 = shl i8 %a62, %b62
+ %r63 = shl i8 %a63, %b63
+ store i8 %r0 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 0 ), align 1
+ store i8 %r1 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 1 ), align 1
+ store i8 %r2 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 2 ), align 1
+ store i8 %r3 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 3 ), align 1
+ store i8 %r4 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 4 ), align 1
+ store i8 %r5 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 5 ), align 1
+ store i8 %r6 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 6 ), align 1
+ store i8 %r7 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 7 ), align 1
+ store i8 %r8 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 8 ), align 1
+ store i8 %r9 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 9 ), align 1
+ store i8 %r10, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 10), align 1
+ store i8 %r11, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 11), align 1
+ store i8 %r12, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 12), align 1
+ store i8 %r13, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 13), align 1
+ store i8 %r14, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 14), align 1
+ store i8 %r15, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 15), align 1
+ store i8 %r16, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16), align 1
+ store i8 %r17, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 17), align 1
+ store i8 %r18, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 18), align 1
+ store i8 %r19, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 19), align 1
+ store i8 %r20, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 20), align 1
+ store i8 %r21, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 21), align 1
+ store i8 %r22, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 22), align 1
+ store i8 %r23, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 23), align 1
+ store i8 %r24, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 24), align 1
+ store i8 %r25, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 25), align 1
+ store i8 %r26, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 26), align 1
+ store i8 %r27, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 27), align 1
+ store i8 %r28, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 28), align 1
+ store i8 %r29, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 29), align 1
+ store i8 %r30, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 30), align 1
+ store i8 %r31, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 31), align 1
+ store i8 %r32, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32), align 1
+ store i8 %r33, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 33), align 1
+ store i8 %r34, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 34), align 1
+ store i8 %r35, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 35), align 1
+ store i8 %r36, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 36), align 1
+ store i8 %r37, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 37), align 1
+ store i8 %r38, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 38), align 1
+ store i8 %r39, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 39), align 1
+ store i8 %r40, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 40), align 1
+ store i8 %r41, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 41), align 1
+ store i8 %r42, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 42), align 1
+ store i8 %r43, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 43), align 1
+ store i8 %r44, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 44), align 1
+ store i8 %r45, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 45), align 1
+ store i8 %r46, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 46), align 1
+ store i8 %r47, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 47), align 1
+ store i8 %r48, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48), align 1
+ store i8 %r49, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 49), align 1
+ store i8 %r50, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 50), align 1
+ store i8 %r51, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 51), align 1
+ store i8 %r52, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 52), align 1
+ store i8 %r53, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 53), align 1
+ store i8 %r54, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 54), align 1
+ store i8 %r55, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 55), align 1
+ store i8 %r56, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 56), align 1
+ store i8 %r57, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 57), align 1
+ store i8 %r58, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 58), align 1
+ store i8 %r59, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 59), align 1
+ store i8 %r60, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 60), align 1
+ store i8 %r61, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 61), align 1
+ store i8 %r62, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 62), align 1
+ store i8 %r63, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 63), align 1
+ ret void
+}
diff --git a/test/Transforms/SimpleLoopUnswitch/trivial-unswitch.ll b/test/Transforms/SimpleLoopUnswitch/trivial-unswitch.ll
index 42b4f3dea75b..3ac3c5138ae7 100644
--- a/test/Transforms/SimpleLoopUnswitch/trivial-unswitch.ll
+++ b/test/Transforms/SimpleLoopUnswitch/trivial-unswitch.ll
@@ -183,3 +183,202 @@ loop_exit3:
; CHECK: [[UNREACHABLE]]:
; CHECK-NEXT: unreachable
}
+
+; This test contains a trivially unswitchable branch with an LCSSA phi node in
+; a loop exit block.
+define i32 @test5(i1 %cond1, i32 %x, i32 %y) {
+; CHECK-LABEL: @test5(
+entry:
+ br label %loop_begin
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 %{{.*}}, label %entry.split, label %loop_exit
+;
+; CHECK: entry.split:
+; CHECK-NEXT: br label %loop_begin
+
+loop_begin:
+ br i1 %cond1, label %latch, label %loop_exit
+; CHECK: loop_begin:
+; CHECK-NEXT: br label %latch
+
+latch:
+ call void @some_func() noreturn nounwind
+ br label %loop_begin
+; CHECK: latch:
+; CHECK-NEXT: call
+; CHECK-NEXT: br label %loop_begin
+
+loop_exit:
+ %result1 = phi i32 [ %x, %loop_begin ]
+ %result2 = phi i32 [ %y, %loop_begin ]
+ %result = add i32 %result1, %result2
+ ret i32 %result
+; CHECK: loop_exit:
+; CHECK-NEXT: %[[R1:.*]] = phi i32 [ %x, %entry ]
+; CHECK-NEXT: %[[R2:.*]] = phi i32 [ %y, %entry ]
+; CHECK-NEXT: %[[R:.*]] = add i32 %[[R1]], %[[R2]]
+; CHECK-NEXT: ret i32 %[[R]]
+}
+
+; This test contains a trivially unswitchable branch with a real phi node in LCSSA
+; position in a shared exit block where a different path through the loop
+; produces a non-invariant input to the PHI node.
+define i32 @test6(i32* %var, i1 %cond1, i1 %cond2, i32 %x, i32 %y) {
+; CHECK-LABEL: @test6(
+entry:
+ br label %loop_begin
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 %{{.*}}, label %entry.split, label %loop_exit.split
+;
+; CHECK: entry.split:
+; CHECK-NEXT: br label %loop_begin
+
+loop_begin:
+ br i1 %cond1, label %continue, label %loop_exit
+; CHECK: loop_begin:
+; CHECK-NEXT: br label %continue
+
+continue:
+ %var_val = load i32, i32* %var
+ br i1 %cond2, label %latch, label %loop_exit
+; CHECK: continue:
+; CHECK-NEXT: load
+; CHECK-NEXT: br i1 %cond2, label %latch, label %loop_exit
+
+latch:
+ call void @some_func() noreturn nounwind
+ br label %loop_begin
+; CHECK: latch:
+; CHECK-NEXT: call
+; CHECK-NEXT: br label %loop_begin
+
+loop_exit:
+ %result1 = phi i32 [ %x, %loop_begin ], [ %var_val, %continue ]
+ %result2 = phi i32 [ %var_val, %continue ], [ %y, %loop_begin ]
+ %result = add i32 %result1, %result2
+ ret i32 %result
+; CHECK: loop_exit:
+; CHECK-NEXT: %[[R1:.*]] = phi i32 [ %var_val, %continue ]
+; CHECK-NEXT: %[[R2:.*]] = phi i32 [ %var_val, %continue ]
+; CHECK-NEXT: br label %loop_exit.split
+;
+; CHECK: loop_exit.split:
+; CHECK-NEXT: %[[R1S:.*]] = phi i32 [ %x, %entry ], [ %[[R1]], %loop_exit ]
+; CHECK-NEXT: %[[R2S:.*]] = phi i32 [ %y, %entry ], [ %[[R2]], %loop_exit ]
+; CHECK-NEXT: %[[R:.*]] = add i32 %[[R1S]], %[[R2S]]
+; CHECK-NEXT: ret i32 %[[R]]
+}
+
+; This test contains a trivially unswitchable switch with an LCSSA phi node in
+; a loop exit block.
+define i32 @test7(i32 %cond1, i32 %x, i32 %y) {
+; CHECK-LABEL: @test7(
+entry:
+ br label %loop_begin
+; CHECK-NEXT: entry:
+; CHECK-NEXT: switch i32 %cond1, label %entry.split [
+; CHECK-NEXT: i32 0, label %loop_exit
+; CHECK-NEXT: i32 1, label %loop_exit
+; CHECK-NEXT: ]
+;
+; CHECK: entry.split:
+; CHECK-NEXT: br label %loop_begin
+
+loop_begin:
+ switch i32 %cond1, label %latch [
+ i32 0, label %loop_exit
+ i32 1, label %loop_exit
+ ]
+; CHECK: loop_begin:
+; CHECK-NEXT: br label %latch
+
+latch:
+ call void @some_func() noreturn nounwind
+ br label %loop_begin
+; CHECK: latch:
+; CHECK-NEXT: call
+; CHECK-NEXT: br label %loop_begin
+
+loop_exit:
+ %result1 = phi i32 [ %x, %loop_begin ], [ %x, %loop_begin ]
+ %result2 = phi i32 [ %y, %loop_begin ], [ %y, %loop_begin ]
+ %result = add i32 %result1, %result2
+ ret i32 %result
+; CHECK: loop_exit:
+; CHECK-NEXT: %[[R1:.*]] = phi i32 [ %x, %entry ], [ %x, %entry ]
+; CHECK-NEXT: %[[R2:.*]] = phi i32 [ %y, %entry ], [ %y, %entry ]
+; CHECK-NEXT: %[[R:.*]] = add i32 %[[R1]], %[[R2]]
+; CHECK-NEXT: ret i32 %[[R]]
+}
+
+; This test contains a trivially unswitchable switch with a real phi node in
+; LCSSA position in a shared exit block where a different path through the loop
+; produces a non-invariant input to the PHI node.
+define i32 @test8(i32* %var, i32 %cond1, i32 %cond2, i32 %x, i32 %y) {
+; CHECK-LABEL: @test8(
+entry:
+ br label %loop_begin
+; CHECK-NEXT: entry:
+; CHECK-NEXT: switch i32 %cond1, label %entry.split [
+; CHECK-NEXT: i32 0, label %loop_exit.split
+; CHECK-NEXT: i32 1, label %loop_exit2
+; CHECK-NEXT: i32 2, label %loop_exit.split
+; CHECK-NEXT: ]
+;
+; CHECK: entry.split:
+; CHECK-NEXT: br label %loop_begin
+
+loop_begin:
+ switch i32 %cond1, label %continue [
+ i32 0, label %loop_exit
+ i32 1, label %loop_exit2
+ i32 2, label %loop_exit
+ ]
+; CHECK: loop_begin:
+; CHECK-NEXT: br label %continue
+
+continue:
+ %var_val = load i32, i32* %var
+ switch i32 %cond2, label %latch [
+ i32 0, label %loop_exit
+ ]
+; CHECK: continue:
+; CHECK-NEXT: load
+; CHECK-NEXT: switch i32 %cond2, label %latch [
+; CHECK-NEXT: i32 0, label %loop_exit
+; CHECK-NEXT: ]
+
+latch:
+ call void @some_func() noreturn nounwind
+ br label %loop_begin
+; CHECK: latch:
+; CHECK-NEXT: call
+; CHECK-NEXT: br label %loop_begin
+
+loop_exit:
+ %result1.1 = phi i32 [ %x, %loop_begin ], [ %x, %loop_begin ], [ %var_val, %continue ]
+ %result1.2 = phi i32 [ %var_val, %continue ], [ %y, %loop_begin ], [ %y, %loop_begin ]
+ %result1 = add i32 %result1.1, %result1.2
+ ret i32 %result1
+; CHECK: loop_exit:
+; CHECK-NEXT: %[[R1:.*]] = phi i32 [ %var_val, %continue ]
+; CHECK-NEXT: %[[R2:.*]] = phi i32 [ %var_val, %continue ]
+; CHECK-NEXT: br label %loop_exit.split
+;
+; CHECK: loop_exit.split:
+; CHECK-NEXT: %[[R1S:.*]] = phi i32 [ %x, %entry ], [ %x, %entry ], [ %[[R1]], %loop_exit ]
+; CHECK-NEXT: %[[R2S:.*]] = phi i32 [ %y, %entry ], [ %y, %entry ], [ %[[R2]], %loop_exit ]
+; CHECK-NEXT: %[[R:.*]] = add i32 %[[R1S]], %[[R2S]]
+; CHECK-NEXT: ret i32 %[[R]]
+
+loop_exit2:
+ %result2.1 = phi i32 [ %x, %loop_begin ]
+ %result2.2 = phi i32 [ %y, %loop_begin ]
+ %result2 = add i32 %result2.1, %result2.2
+ ret i32 %result2
+; CHECK: loop_exit2:
+; CHECK-NEXT: %[[R1:.*]] = phi i32 [ %x, %entry ]
+; CHECK-NEXT: %[[R2:.*]] = phi i32 [ %y, %entry ]
+; CHECK-NEXT: %[[R:.*]] = add i32 %[[R1]], %[[R2]]
+; CHECK-NEXT: ret i32 %[[R]]
+}
diff --git a/test/Transforms/SpeculativeExecution/spec-other.ll b/test/Transforms/SpeculativeExecution/spec-other.ll
deleted file mode 100644
index 65e14b69e9e6..000000000000
--- a/test/Transforms/SpeculativeExecution/spec-other.ll
+++ /dev/null
@@ -1,32 +0,0 @@
-; RUN: opt < %s -S -speculative-execution \
-; RUN: -spec-exec-max-speculation-cost 4 -spec-exec-max-not-hoisted 3 \
-; RUN: | FileCheck %s
-
-; CHECK-LABEL: @ifThen_extractvalue(
-; CHECK: extractvalue
-; CHECK: br i1 true
-define void @ifThen_extractvalue() {
- br i1 true, label %a, label %b
-
-a:
- %x = extractvalue { i32, i32 } undef, 0
- br label %b
-
-b:
- ret void
-}
-
-; CHECK-LABEL: @ifThen_insertvalue(
-; CHECK: insertvalue
-; CHECK: br i1 true
-define void @ifThen_insertvalue() {
- br i1 true, label %a, label %b
-
-a:
- %x = insertvalue { i32, i32 } undef, i32 undef, 0
- br label %b
-
-b:
- ret void
-}
-
diff --git a/test/Transforms/SpeculativeExecution/spec-vector.ll b/test/Transforms/SpeculativeExecution/spec-vector.ll
deleted file mode 100644
index 9c64f1fb1005..000000000000
--- a/test/Transforms/SpeculativeExecution/spec-vector.ll
+++ /dev/null
@@ -1,73 +0,0 @@
-; RUN: opt < %s -S -speculative-execution \
-; RUN: -spec-exec-max-speculation-cost 4 -spec-exec-max-not-hoisted 3 \
-; RUN: | FileCheck %s
-
-; CHECK-LABEL: @ifThen_extractelement_constindex(
-; CHECK: extractelement
-; CHECK: br i1 true
-define void @ifThen_extractelement_constindex() {
- br i1 true, label %a, label %b
-
-a:
- %x = extractelement <4 x i32> undef, i32 0
- br label %b
-
-b:
- ret void
-}
-
-; CHECK-LABEL: @ifThen_extractelement_varindex(
-; CHECK: extractelement
-; CHECK: br i1 true
-define void @ifThen_extractelement_varindex(i32 %idx) {
- br i1 true, label %a, label %b
-
-a:
- %x = extractelement <4 x i32> undef, i32 %idx
- br label %b
-
-b:
- ret void
-}
-
-; CHECK-LABEL: @ifThen_insertelement_constindex(
-; CHECK: insertelement
-; CHECK: br i1 true
-define void @ifThen_insertelement_constindex() {
- br i1 true, label %a, label %b
-
-a:
- %x = insertelement <4 x i32> undef, i32 undef, i32 0
- br label %b
-
-b:
- ret void
-}
-
-; CHECK-LABEL: @ifThen_insertelement_varindex(
-; CHECK: insertelement
-; CHECK: br i1 true
-define void @ifThen_insertelement_varindex(i32 %idx) {
- br i1 true, label %a, label %b
-
-a:
- %x = insertelement <4 x i32> undef, i32 undef, i32 %idx
- br label %b
-
-b:
- ret void
-}
-
-; CHECK-LABEL: @ifThen_shufflevector(
-; CHECK: shufflevector
-; CHECK: br i1 true
-define void @ifThen_shufflevector() {
- br i1 true, label %a, label %b
-
-a:
- %x = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> undef
- br label %b
-
-b:
- ret void
-}
diff --git a/test/Transforms/Util/split-bit-piece.ll b/test/Transforms/Util/split-bit-piece.ll
index 3d7bcac73ca3..5a374e839926 100644
--- a/test/Transforms/Util/split-bit-piece.ll
+++ b/test/Transforms/Util/split-bit-piece.ll
@@ -3,43 +3,85 @@
; if it only describes part of the variable.
; RUN: opt -S -sroa %s | FileCheck %s
-; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #0
+; Built from:
+; struct foo { bool b; long i; };
+; void f(bool b, bool expr, foo g) {
+; }
+; And modifying the frag dbg.declare to use a fragmented DIExpression (with offset: 0, size: 4)
+; to test the dbg.declare+fragment case here.
-; Function Attrs: nounwind uwtable
-define hidden void @_ZN6__tsan9FastState14SetHistorySizeEi(i32 %hs) #1 align 2 {
+; Expect two fragments:
+; * first starting at bit 0, 8 bits (for the bool)
+; * second starting at bit 32, 32 bits (for the long)
+; (this happens to create/demonstrate a gap from bits [7, 32))
+
+; But also check that a complex expression is not used for a lone bool
+; parameter. It can reference the register it's in directly without masking off
+; high bits or anything
+
+; CHECK: call void @llvm.dbg.value(metadata i8 %g.coerce0, i64 0, metadata ![[VAR_STRUCT:[0-9]+]], metadata ![[EXPR_STRUCT1:[0-9]+]])
+; CHECK: call void @llvm.dbg.value(metadata i64 %g.coerce1, i64 0, metadata ![[VAR_STRUCT]], metadata ![[EXPR_STRUCT2:[0-9]+]])
+; CHECK: call void @llvm.dbg.value(metadata i1 %b, i64 0, metadata ![[VAR_BOOL:[0-9]+]], metadata ![[EXPR_BOOL:[0-9]+]])
+; CHECK: call void @llvm.dbg.value(metadata i1 %frag, i64 0, metadata ![[FRAG_BOOL:[0-9]+]], metadata ![[FRAG_BOOL:[0-9]+]])
+; CHECK: ![[EXPR_STRUCT1]] = !DIExpression(DW_OP_LLVM_fragment, 0, 8)
+; CHECK: ![[EXPR_STRUCT2]] = !DIExpression(DW_OP_LLVM_fragment, 32, 64)
+; CHECK: ![[EXPR_BOOL]] = !DIExpression()
+; CHECK: ![[FRAG_BOOL]] = !DIExpression(DW_OP_LLVM_fragment, 0, 1)
+
+%struct.foo = type { i8, i64 }
+
+; Function Attrs: noinline nounwind uwtable
+define void @_Z1fbb3foo(i1 zeroext %b, i1 zeroext %frag, i8 %g.coerce0, i64 %g.coerce1) #0 !dbg !6 {
entry:
- %hs.addr = alloca i32, align 4
- %v1 = alloca i64, align 8
- %v2 = alloca i64, align 8
- store i32 %hs, i32* %hs.addr, align 4
-; CHECK: call void @llvm.dbg.value(metadata i32 %hs, i64 0, metadata !{{[0-9]+}}, metadata ![[EXPR:[0-9]+]])
-; CHECK: ![[EXPR]] = !DIExpression(DW_OP_LLVM_fragment, 0
- call void @llvm.dbg.declare(metadata i64* %v1, metadata !9, metadata !12), !dbg !13
- %0 = load i32, i32* %hs.addr, align 4
- %conv = sext i32 %0 to i64
- store i64 %conv, i64* %v1, align 8
- %1 = load i64, i64* %v2, align 8
- unreachable
+ %g = alloca %struct.foo, align 8
+ %b.addr = alloca i8, align 1
+ %frag.addr = alloca i8, align 1
+ %0 = bitcast %struct.foo* %g to { i8, i64 }*
+ %1 = getelementptr inbounds { i8, i64 }, { i8, i64 }* %0, i32 0, i32 0
+ store i8 %g.coerce0, i8* %1, align 8
+ %2 = getelementptr inbounds { i8, i64 }, { i8, i64 }* %0, i32 0, i32 1
+ store i64 %g.coerce1, i64* %2, align 8
+ %frombool = zext i1 %b to i8
+ store i8 %frombool, i8* %b.addr, align 1
+ call void @llvm.dbg.declare(metadata i8* %b.addr, metadata !15, metadata !16), !dbg !17
+ %frombool1 = zext i1 %frag to i8
+ store i8 %frombool1, i8* %frag.addr, align 1
+ call void @llvm.dbg.declare(metadata i8* %frag.addr, metadata !18, metadata !23), !dbg !19
+ call void @llvm.dbg.declare(metadata %struct.foo* %g, metadata !20, metadata !16), !dbg !21
+ ret void, !dbg !22
}
-attributes #0 = { nounwind readnone }
+; Function Attrs: nounwind readnone speculatable
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+
+attributes #0 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone speculatable }
!llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!7}
-!llvm.ident = !{!8}
-
-!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 3.8.0 (trunk 256979) (llvm/trunk 257107)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, retainedTypes: !2)
-!1 = !DIFile(filename: "tsan_shadow_test.cc", directory: "/tmp")
-!2 = !{!3, !5}
-!3 = !DICompositeType(tag: DW_TAG_class_type, name: "FastState", file: !4, line: 91, size: 64, align: 64, identifier: "_ZTSN6__tsan9FastStateE")
-!4 = !DIFile(filename: "/mnt/extra/llvm/projects/compiler-rt/lib/tsan/rtl/tsan_rtl.h", directory: "/tmp")
-!5 = distinct !DIDerivedType(tag: DW_TAG_typedef, name: "u64", line: 78, baseType: !6)
-!6 = !DIBasicType(name: "long long unsigned int", size: 64, align: 64, encoding: DW_ATE_unsigned)
-!7 = !{i32 2, !"Debug Info Version", i32 3}
-!8 = !{!"clang version 3.8.0 (trunk 256979) (llvm/trunk 257107)"}
-!9 = !DILocalVariable(name: "v1", scope: !10, file: !4, line: 136, type: !5)
-!10 = distinct !DILexicalBlock(scope: !11, file: !4, line: 136, column: 5)
-!11 = distinct !DISubprogram(name: "SetHistorySize", linkageName: "_ZN6__tsan9FastState14SetHistorySizeEi", scope: !3, file: !4, line: 135, isLocal: false, isDefinition: true, scopeLine: 135, flags: DIFlagPrototyped, isOptimized: false, unit: !0)
-!12 = !DIExpression()
-!13 = !DILocation(line: 136, column: 5, scope: !10)
+!llvm.module.flags = !{!3, !4}
+!llvm.ident = !{!5}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 5.0.0 (trunk 303077) (llvm/trunk 303098)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "foo.cpp", directory: "/usr/local/google/home/blaikie/dev/scratch")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{!"clang version 5.0.0 (trunk 303077) (llvm/trunk 303098)"}
+!6 = distinct !DISubprogram(name: "f", linkageName: "_Z1fbb3foo", scope: !1, file: !1, line: 2, type: !7, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2)
+!7 = !DISubroutineType(types: !8)
+!8 = !{null, !9, !9, !10}
+!9 = !DIBasicType(name: "bool", size: 8, encoding: DW_ATE_boolean)
+!10 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "foo", file: !1, line: 1, size: 128, elements: !11, identifier: "_ZTS3foo")
+!11 = !{!12, !13}
+!12 = !DIDerivedType(tag: DW_TAG_member, name: "b", scope: !10, file: !1, line: 1, baseType: !9, size: 8)
+!13 = !DIDerivedType(tag: DW_TAG_member, name: "i", scope: !10, file: !1, line: 1, baseType: !14, size: 64, offset: 64)
+!14 = !DIBasicType(name: "long int", size: 64, encoding: DW_ATE_signed)
+!15 = !DILocalVariable(name: "b", arg: 1, scope: !6, file: !1, line: 2, type: !9)
+!16 = !DIExpression()
+!17 = !DILocation(line: 2, column: 13, scope: !6)
+!18 = !DILocalVariable(name: "frag", arg: 2, scope: !6, file: !1, line: 2, type: !9)
+!19 = !DILocation(line: 2, column: 21, scope: !6)
+!20 = !DILocalVariable(name: "g", arg: 3, scope: !6, file: !1, line: 2, type: !10)
+!21 = !DILocation(line: 2, column: 31, scope: !6)
+!22 = !DILocation(line: 3, column: 1, scope: !6)
+!23 = !DIExpression(DW_OP_LLVM_fragment, 0, 4)