Diffstat (limited to 'test')
79 files changed, 3455 insertions, 3037 deletions
diff --git a/test/Analysis/CostModel/X86/alternate-shuffle-cost.ll b/test/Analysis/CostModel/X86/alternate-shuffle-cost.ll index 2e162f0f0005..9e706d62f8fc 100644 --- a/test/Analysis/CostModel/X86/alternate-shuffle-cost.ll +++ b/test/Analysis/CostModel/X86/alternate-shuffle-cost.ll @@ -207,7 +207,7 @@ define <8 x i16> @test_v8i16(<8 x i16> %a, <8 x i16> %b) { ret <8 x i16> %1 } ; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v8i16': -; SSE2: Cost Model: {{.*}} 8 for instruction: %1 = shufflevector +; SSE2: Cost Model: {{.*}} 3 for instruction: %1 = shufflevector ; SSSE3: Cost Model: {{.*}} 3 for instruction: %1 = shufflevector ; SSE41: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector ; AVX: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector @@ -219,7 +219,7 @@ define <8 x i16> @test_v8i16_2(<8 x i16> %a, <8 x i16> %b) { ret <8 x i16> %1 } ; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v8i16_2': -; SSE2: Cost Model: {{.*}} 8 for instruction: %1 = shufflevector +; SSE2: Cost Model: {{.*}} 3 for instruction: %1 = shufflevector ; SSSE3: Cost Model: {{.*}} 3 for instruction: %1 = shufflevector ; SSE41: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector ; AVX: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector @@ -280,11 +280,11 @@ define <16 x i8> @test_v16i8(<16 x i8> %a, <16 x i8> %b) { ret <16 x i8> %1 } ; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v16i8': -; SSE2: Cost Model: {{.*}} 48 for instruction: %1 = shufflevector +; SSE2: Cost Model: {{.*}} 3 for instruction: %1 = shufflevector ; SSSE3: Cost Model: {{.*}} 3 for instruction: %1 = shufflevector -; SSE41: Cost Model: {{.*}} 3 for instruction: %1 = shufflevector -; AVX: Cost Model: {{.*}} 3 for instruction: %1 = shufflevector -; AVX2: Cost Model: {{.*}} 3 for instruction: %1 = shufflevector +; SSE41: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector +; AVX: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector +; AVX2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector define <16 x i8> @test_v16i8_2(<16 x i8> %a, <16 x i8> %b) { @@ -292,11 +292,11 @@ define <16 x i8> @test_v16i8_2(<16 x i8> %a, <16 x i8> %b) { ret <16 x i8> %1 } ; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v16i8_2': -; SSE2: Cost Model: {{.*}} 48 for instruction: %1 = shufflevector +; SSE2: Cost Model: {{.*}} 3 for instruction: %1 = shufflevector ; SSSE3: Cost Model: {{.*}} 3 for instruction: %1 = shufflevector -; SSE41: Cost Model: {{.*}} 3 for instruction: %1 = shufflevector -; AVX: Cost Model: {{.*}} 3 for instruction: %1 = shufflevector -; AVX2: Cost Model: {{.*}} 3 for instruction: %1 = shufflevector +; SSE41: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector +; AVX: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector +; AVX2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) { @@ -304,10 +304,10 @@ define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) { ret <16 x i16> %1 } ; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v16i16': -; SSE2: Cost Model: {{.*}} 16 for instruction: %1 = shufflevector +; SSE2: Cost Model: {{.*}} 6 for instruction: %1 = shufflevector ; SSSE3: Cost Model: {{.*}} 6 for instruction: %1 = shufflevector ; SSE41: Cost Model: {{.*}} 2 for instruction: %1 = shufflevector -; AVX: Cost Model: {{.*}} 5 for instruction: %1 = shufflevector +; AVX: Cost Model: {{.*}} 3 for instruction: %1 = shufflevector ; AVX2: Cost 
Model: {{.*}} 1 for instruction: %1 = shufflevector @@ -316,10 +316,10 @@ define <16 x i16> @test_v16i16_2(<16 x i16> %a, <16 x i16> %b) { ret <16 x i16> %1 } ; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v16i16_2': -; SSE2: Cost Model: {{.*}} 16 for instruction: %1 = shufflevector +; SSE2: Cost Model: {{.*}} 6 for instruction: %1 = shufflevector ; SSSE3: Cost Model: {{.*}} 6 for instruction: %1 = shufflevector ; SSE41: Cost Model: {{.*}} 2 for instruction: %1 = shufflevector -; AVX: Cost Model: {{.*}} 5 for instruction: %1 = shufflevector +; AVX: Cost Model: {{.*}} 3 for instruction: %1 = shufflevector ; AVX2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector define <32 x i8> @test_v32i8(<32 x i8> %a, <32 x i8> %b) { @@ -327,11 +327,11 @@ define <32 x i8> @test_v32i8(<32 x i8> %a, <32 x i8> %b) { ret <32 x i8> %1 } ; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v32i8': -; SSE2: Cost Model: {{.*}} 96 for instruction: %1 = shufflevector +; SSE2: Cost Model: {{.*}} 6 for instruction: %1 = shufflevector ; SSSE3: Cost Model: {{.*}} 6 for instruction: %1 = shufflevector -; SSE41: Cost Model: {{.*}} 6 for instruction: %1 = shufflevector -; AVX: Cost Model: {{.*}} 9 for instruction: %1 = shufflevector -; AVX2: Cost Model: {{.*}} 9 for instruction: %1 = shufflevector +; SSE41: Cost Model: {{.*}} 2 for instruction: %1 = shufflevector +; AVX: Cost Model: {{.*}} 3 for instruction: %1 = shufflevector +; AVX2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector define <32 x i8> @test_v32i8_2(<32 x i8> %a, <32 x i8> %b) { @@ -339,9 +339,9 @@ define <32 x i8> @test_v32i8_2(<32 x i8> %a, <32 x i8> %b) { ret <32 x i8> %1 } ; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v32i8_2': -; SSE2: Cost Model: {{.*}} 96 for instruction: %1 = shufflevector +; SSE2: Cost Model: {{.*}} 6 for instruction: %1 = shufflevector ; SSSE3: Cost Model: {{.*}} 6 for instruction: %1 = shufflevector -; SSE41: Cost Model: {{.*}} 6 for instruction: %1 = shufflevector -; AVX: Cost Model: {{.*}} 9 for instruction: %1 = shufflevector -; AVX2: Cost Model: {{.*}} 9 for instruction: %1 = shufflevector +; SSE41: Cost Model: {{.*}} 2 for instruction: %1 = shufflevector +; AVX: Cost Model: {{.*}} 3 for instruction: %1 = shufflevector +; AVX2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector diff --git a/test/Analysis/RegionInfo/bad_node_traversal.ll b/test/Analysis/RegionInfo/bad_node_traversal.ll new file mode 100644 index 000000000000..00dd1207af9f --- /dev/null +++ b/test/Analysis/RegionInfo/bad_node_traversal.ll @@ -0,0 +1,43 @@ +; REQUIRES: asserts +; RUN: opt -regions -analyze < %s | FileCheck %s + +; While working on improvements to the region info analysis, this test +; case caused an incorrect region 3 => 8 to be detected. 
+ +define internal i8 @wibble() { +bb: + br i1 true, label %bb1, label %bb8 + +bb1: ; preds = %bb + switch i32 0, label %bb2 [ + i32 0, label %bb3 + i32 1, label %bb7 + ] + +bb2: ; preds = %bb1 + br label %bb4 + +bb3: ; preds = %bb1 + br label %bb5 + +bb4: ; preds = %bb2 + br label %bb6 + +bb5: ; preds = %bb3 + br label %bb6 + +bb6: ; preds = %bb5, %bb4 + br label %bb7 + +bb7: ; preds = %bb6, %bb1 + br label %bb8 + +bb8: ; preds = %bb7, %bb + ret i8 1 +} + +; CHECK: [0] bb => <Function Return> +; CHECK-NEXT: [1] bb => bb8 +; CHECK-NEXT: [2] bb1 => bb7 +; CHECK-NEXT: End region tree + diff --git a/test/Bitcode/DIGlobalVariableExpression.ll b/test/Bitcode/DIGlobalVariableExpression.ll index 0424a0e42a36..0bb0488b131f 100644 --- a/test/Bitcode/DIGlobalVariableExpression.ll +++ b/test/Bitcode/DIGlobalVariableExpression.ll @@ -1,5 +1,8 @@ ; RUN: llvm-dis -o - %s.bc | FileCheck %s +; RUN: llvm-dis -o - %s.bc | llvm-as - | llvm-bcanalyzer -dump - | FileCheck %s --check-prefix=BC +; BC: GLOBAL_VAR_EXPR +; BC: GLOBAL_DECL_ATTACHMENT ; CHECK: @g = common global i32 0, align 4, !dbg ![[G:[0-9]+]] ; CHECK: @h = common global i32 0, align 4, !dbg ![[H:[0-9]+]] ; CHECK: ![[G]] = {{.*}}!DIGlobalVariableExpression(var: ![[GVAR:[0-9]+]], expr: ![[GEXPR:[0-9]+]]) diff --git a/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll b/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll index ae77f7e099db..412651c55678 100644 --- a/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll +++ b/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll @@ -1,5 +1,6 @@ ; RUN: llc -mtriple=arm64-apple-ios -mcpu=cyclone < %s | FileCheck %s -check-prefix=CYCLONE --check-prefix=ALL ; RUN: llc -mtriple=aarch64-gnu-linux -mcpu=kryo < %s | FileCheck %s -check-prefix=KRYO --check-prefix=ALL +; RUN: llc -mtriple=aarch64-gnu-linux -mcpu=falkor < %s | FileCheck %s -check-prefix=FALKOR --check-prefix=ALL ; rdar://11481771 ; rdar://13713797 @@ -16,6 +17,10 @@ entry: ; KRYO: movi v1.2d, #0000000000000000 ; KRYO: movi v2.2d, #0000000000000000 ; KRYO: movi v3.2d, #0000000000000000 +; FALKOR: movi v0.2d, #0000000000000000 +; FALKOR: movi v1.2d, #0000000000000000 +; FALKOR: movi v2.2d, #0000000000000000 +; FALKOR: movi v3.2d, #0000000000000000 tail call void @bar(double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00) nounwind ret void } @@ -47,6 +52,8 @@ define void @t4() nounwind ssp { ; CYCLONE: movi.2d v1, #0000000000000000 ; KRYO: movi v0.2d, #0000000000000000 ; KRYO: movi v1.2d, #0000000000000000 +; FALKOR: movi v0.2d, #0000000000000000 +; FALKOR: movi v1.2d, #0000000000000000 tail call void @barf(float 0.000000e+00, float 0.000000e+00) nounwind ret void } diff --git a/test/CodeGen/AArch64/store_merge_pair_offset.ll b/test/CodeGen/AArch64/store_merge_pair_offset.ll new file mode 100644 index 000000000000..a091f0fd911c --- /dev/null +++ b/test/CodeGen/AArch64/store_merge_pair_offset.ll @@ -0,0 +1,12 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-enable-atomic-cfg-tidy=0 -disable-lsr -verify-machineinstrs -enable-misched=false -enable-post-misched=false -o - %s | FileCheck %s + +define i64 @test(i64* %a) nounwind { + ; CHECK: ldp x{{[0-9]+}}, x{{[0-9]+}} + ; CHECK-NOT: ldr + %p1 = getelementptr inbounds i64, i64* %a, i32 64 + %tmp1 = load i64, i64* %p1, align 2 + %p2 = getelementptr inbounds i64, i64* %a, i32 63 + %tmp2 = load i64, i64* %p2, align 2 + %tmp3 = add i64 %tmp1, %tmp2 + ret i64 %tmp3 +} diff --git a/test/CodeGen/AMDGPU/amdgcn.sendmsg-m0.ll b/test/CodeGen/AMDGPU/amdgcn.sendmsg-m0.ll new file mode 100644 index 
000000000000..8d8885852afe --- /dev/null +++ b/test/CodeGen/AMDGPU/amdgcn.sendmsg-m0.ll @@ -0,0 +1,41 @@ +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s + +; GCN-LABEL: {{^}}main: +; GCN: s_mov_b32 m0, s0 +; VI-NEXT: s_nop 0 +; GCN-NEXT: sendmsg(MSG_GS_DONE, GS_OP_NOP) +; GCN-NEXT: s_endpgm + +define amdgpu_gs void @main(i32 inreg %a) #0 { + call void @llvm.amdgcn.s.sendmsg(i32 3, i32 %a) + ret void +} + +; GCN-LABEL: {{^}}main_halt: +; GCN: s_mov_b32 m0, s0 +; VI-NEXT: s_nop 0 +; GCN-NEXT: s_sendmsghalt sendmsg(MSG_INTERRUPT) +; GCN-NEXT: s_endpgm + +define void @main_halt(i32 inreg %a) #0 { + call void @llvm.amdgcn.s.sendmsghalt(i32 1, i32 %a) + ret void +} + +; GCN-LABEL: {{^}}legacy: +; GCN: s_mov_b32 m0, s0 +; VI-NEXT: s_nop 0 +; GCN-NEXT: sendmsg(MSG_GS_DONE, GS_OP_NOP) +; GCN-NEXT: s_endpgm + +define amdgpu_gs void @legacy(i32 inreg %a) #0 { + call void @llvm.SI.sendmsg(i32 3, i32 %a) + ret void +} + +declare void @llvm.amdgcn.s.sendmsg(i32, i32) #0 +declare void @llvm.amdgcn.s.sendmsghalt(i32, i32) #0 +declare void @llvm.SI.sendmsg(i32, i32) #0 + +attributes #0 = { nounwind } diff --git a/test/CodeGen/AMDGPU/amdgcn.sendmsg.ll b/test/CodeGen/AMDGPU/amdgcn.sendmsg.ll new file mode 100644 index 000000000000..31f9cfca6def --- /dev/null +++ b/test/CodeGen/AMDGPU/amdgcn.sendmsg.ll @@ -0,0 +1,161 @@ +;RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck %s +;RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s + +; CHECK-LABEL: {{^}}test_interrupt: +; CHECK: s_mov_b32 m0, 0 +; CHECK-NOT: s_mov_b32 m0 +; CHECK: s_sendmsg sendmsg(MSG_INTERRUPT) +define void @test_interrupt() { +body: + call void @llvm.amdgcn.s.sendmsg(i32 1, i32 0); + ret void +} + +; CHECK-LABEL: {{^}}test_gs_emit: +; CHECK: s_mov_b32 m0, 0 +; CHECK-NOT: s_mov_b32 m0 +; CHECK: s_sendmsg sendmsg(MSG_GS, GS_OP_EMIT, 0) +define void @test_gs_emit() { +body: + call void @llvm.amdgcn.s.sendmsg(i32 34, i32 0); + ret void +} + +; CHECK-LABEL: {{^}}test_gs_cut: +; CHECK: s_mov_b32 m0, 0 +; CHECK-NOT: s_mov_b32 m0 +; CHECK: s_sendmsg sendmsg(MSG_GS, GS_OP_CUT, 1) +define void @test_gs_cut() { +body: + call void @llvm.amdgcn.s.sendmsg(i32 274, i32 0); + ret void +} + +; CHECK-LABEL: {{^}}test_gs_emit_cut: +; CHECK: s_mov_b32 m0, 0 +; CHECK-NOT: s_mov_b32 m0 +; CHECK: s_sendmsg sendmsg(MSG_GS, GS_OP_EMIT_CUT, 2) +define void @test_gs_emit_cut() { +body: + call void @llvm.amdgcn.s.sendmsg(i32 562, i32 0) + ret void +} + +; CHECK-LABEL: {{^}}test_gs_done: +; CHECK: s_mov_b32 m0, 0 +; CHECK-NOT: s_mov_b32 m0 +; CHECK: s_sendmsg sendmsg(MSG_GS_DONE, GS_OP_NOP) +define void @test_gs_done() { +body: + call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0) + ret void +} + + +; CHECK-LABEL: {{^}}test_interrupt_halt: +; CHECK: s_mov_b32 m0, 0 +; CHECK-NOT: s_mov_b32 m0 +; CHECK: s_sendmsghalt sendmsg(MSG_INTERRUPT) +define void @test_interrupt_halt() { +body: + call void @llvm.amdgcn.s.sendmsghalt(i32 1, i32 0) + ret void +} + +; CHECK-LABEL: {{^}}test_gs_emit_halt: +; CHECK: s_mov_b32 m0, 0 +; CHECK-NOT: s_mov_b32 m0 +; CHECK: s_sendmsghalt sendmsg(MSG_GS, GS_OP_EMIT, 0) +define void @test_gs_emit_halt() { +body: + call void @llvm.amdgcn.s.sendmsghalt(i32 34, i32 0) + ret void +} + +; CHECK-LABEL: {{^}}test_gs_cut_halt: +; CHECK: s_mov_b32 m0, 0 +; CHECK-NOT: s_mov_b32 m0 +; CHECK: s_sendmsghalt sendmsg(MSG_GS, GS_OP_CUT, 1) 
+define void @test_gs_cut_halt() { +body: + call void @llvm.amdgcn.s.sendmsghalt(i32 274, i32 0) + ret void +} + +; CHECK-LABEL: {{^}}test_gs_emit_cut_halt: +; CHECK: s_mov_b32 m0, 0 +; CHECK-NOT: s_mov_b32 m0 +; CHECK: s_sendmsghalt sendmsg(MSG_GS, GS_OP_EMIT_CUT, 2) +define void @test_gs_emit_cut_halt() { +body: + call void @llvm.amdgcn.s.sendmsghalt(i32 562, i32 0) + ret void +} + +; CHECK-LABEL: {{^}}test_gs_done_halt: +; CHECK: s_mov_b32 m0, 0 +; CHECK-NOT: s_mov_b32 m0 +; CHECK: s_sendmsghalt sendmsg(MSG_GS_DONE, GS_OP_NOP) +define void @test_gs_done_halt() { +body: + call void @llvm.amdgcn.s.sendmsghalt(i32 3, i32 0) + ret void +} + +; Legacy +; CHECK-LABEL: {{^}}test_legacy_interrupt: +; CHECK: s_mov_b32 m0, 0 +; CHECK-NOT: s_mov_b32 m0 +; CHECK: s_sendmsg sendmsg(MSG_INTERRUPT) +define void @test_legacy_interrupt() { +body: + call void @llvm.SI.sendmsg(i32 1, i32 0) + ret void +} + +; CHECK-LABEL: {{^}}test_legacy_gs_emit: +; CHECK: s_mov_b32 m0, 0 +; CHECK-NOT: s_mov_b32 m0 +; CHECK: s_sendmsg sendmsg(MSG_GS, GS_OP_EMIT, 0) +define void @test_legacy_gs_emit() { +body: + call void @llvm.SI.sendmsg(i32 34, i32 0) + ret void +} + +; CHECK-LABEL: {{^}}test_legacy_gs_cut: +; CHECK: s_mov_b32 m0, 0 +; CHECK-NOT: s_mov_b32 m0 +; CHECK: s_sendmsg sendmsg(MSG_GS, GS_OP_CUT, 1) +define void @test_legacy_gs_cut() { +body: + call void @llvm.SI.sendmsg(i32 274, i32 0) + ret void +} + +; CHECK-LABEL: {{^}}test_legacy_gs_emit_cut: +; CHECK: s_mov_b32 m0, 0 +; CHECK-NOT: s_mov_b32 m0 +; CHECK: s_sendmsg sendmsg(MSG_GS, GS_OP_EMIT_CUT, 2) +define void @test_legacy_gs_emit_cut() { +body: + call void @llvm.SI.sendmsg(i32 562, i32 0) + ret void +} + +; CHECK-LABEL: {{^}}test_legacy_gs_done: +; CHECK: s_mov_b32 m0, 0 +; CHECK-NOT: s_mov_b32 m0 +; CHECK: s_sendmsg sendmsg(MSG_GS_DONE, GS_OP_NOP) +define void @test_legacy_gs_done() { +body: + call void @llvm.SI.sendmsg(i32 3, i32 0) + ret void +} + +; Function Attrs: nounwind +declare void @llvm.amdgcn.s.sendmsg(i32, i32) #0 +declare void @llvm.amdgcn.s.sendmsghalt(i32, i32) #0 +declare void @llvm.SI.sendmsg(i32, i32) #0 + +attributes #0 = { nounwind } diff --git a/test/CodeGen/AMDGPU/llvm.SI.sendmsg-m0.ll b/test/CodeGen/AMDGPU/llvm.SI.sendmsg-m0.ll deleted file mode 100644 index 2d4987643a2b..000000000000 --- a/test/CodeGen/AMDGPU/llvm.SI.sendmsg-m0.ll +++ /dev/null @@ -1,17 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s - -; GCN-LABEL: {{^}}main: -; GCN: s_mov_b32 m0, s0 -; VI-NEXT: s_nop 0 -; GCN-NEXT: sendmsg(MSG_GS_DONE, GS_OP_NOP) -; GCN-NEXT: s_endpgm - -define amdgpu_gs void @main(i32 inreg %a) #0 { - call void @llvm.SI.sendmsg(i32 3, i32 %a) - ret void -} - -declare void @llvm.SI.sendmsg(i32, i32) #0 - -attributes #0 = { nounwind } diff --git a/test/CodeGen/AMDGPU/llvm.SI.sendmsg.ll b/test/CodeGen/AMDGPU/llvm.SI.sendmsg.ll deleted file mode 100644 index c4bb27676e7d..000000000000 --- a/test/CodeGen/AMDGPU/llvm.SI.sendmsg.ll +++ /dev/null @@ -1,24 +0,0 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -; CHECK-LABEL: {{^}}main: -; CHECK: s_mov_b32 m0, 0 -; CHECK-NOT: s_mov_b32 m0 -; CHECK: s_sendmsg sendmsg(MSG_GS, GS_OP_EMIT, 0) -; CHECK: s_sendmsg sendmsg(MSG_GS, GS_OP_CUT, 1) -; CHECK: s_sendmsg sendmsg(MSG_GS, GS_OP_EMIT_CUT, 2) -; CHECK: 
s_sendmsg sendmsg(MSG_GS_DONE, GS_OP_NOP) - -define void @main() { -main_body: - call void @llvm.SI.sendmsg(i32 34, i32 0); - call void @llvm.SI.sendmsg(i32 274, i32 0); - call void @llvm.SI.sendmsg(i32 562, i32 0); - call void @llvm.SI.sendmsg(i32 3, i32 0); - ret void -} - -; Function Attrs: nounwind -declare void @llvm.SI.sendmsg(i32, i32) #0 - -attributes #0 = { nounwind } diff --git a/test/CodeGen/PowerPC/ppc64-blnop.ll b/test/CodeGen/PowerPC/ppc64-blnop.ll new file mode 100644 index 000000000000..2fe23f91c83d --- /dev/null +++ b/test/CodeGen/PowerPC/ppc64-blnop.ll @@ -0,0 +1,129 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu | FileCheck %s +; RUN: llc < %s -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr8 | FileCheck %s +; RUN: llc < %s -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 | FileCheck %s +; RUN: llc < %s -relocation-model=pic -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu | FileCheck %s +; RUN: llc < %s -function-sections -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu | FileCheck %s -check-prefix=CHECK-FS +; RUN: llc < %s -relocation-model=pic -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu | FileCheck %s +; RUN: llc < %s -function-sections -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu | FileCheck %s -check-prefix=CHECK-FS + +%class.T = type { [2 x i8] } + +define void @e_callee(%class.T* %this, i8* %c) { ret void } +define void @e_caller(%class.T* %this, i8* %c) { + call void @e_callee(%class.T* %this, i8* %c) + ret void + +; CHECK-LABEL: e_caller: +; CHECK: bl e_callee +; CHECK-NEXT: nop + +; CHECK-FS-LABEL: e_caller: +; CHECK-FS: bl e_callee +; CHECK-FS-NEXT: nop +} + +define void @e_scallee(%class.T* %this, i8* %c) section "different" { ret void } +define void @e_scaller(%class.T* %this, i8* %c) { + call void @e_scallee(%class.T* %this, i8* %c) + ret void + +; CHECK-LABEL: e_scaller: +; CHECK: bl e_scallee +; CHECK-NEXT: nop +} + +define void @e_s2callee(%class.T* %this, i8* %c) { ret void } +define void @e_s2caller(%class.T* %this, i8* %c) section "different" { + call void @e_s2callee(%class.T* %this, i8* %c) + ret void + +; CHECK-LABEL: e_s2caller: +; CHECK: bl e_s2callee +; CHECK-NEXT: nop +} + +$cd1 = comdat any +$cd2 = comdat any + +define void @e_ccallee(%class.T* %this, i8* %c) comdat($cd1) { ret void } +define void @e_ccaller(%class.T* %this, i8* %c) comdat($cd2) { + call void @e_ccallee(%class.T* %this, i8* %c) + ret void + +; CHECK-LABEL: e_ccaller: +; CHECK: bl e_ccallee +; CHECK-NEXT: nop +} + +$cd = comdat any + +define void @e_c1callee(%class.T* %this, i8* %c) comdat($cd) { ret void } +define void @e_c1caller(%class.T* %this, i8* %c) comdat($cd) { + call void @e_c1callee(%class.T* %this, i8* %c) + ret void + +; CHECK-LABEL: e_c1caller: +; CHECK: bl e_c1callee +; CHECK-NEXT: nop +} + +define weak_odr hidden void @wo_hcallee(%class.T* %this, i8* %c) { ret void } +define void @wo_hcaller(%class.T* %this, i8* %c) { + call void @wo_hcallee(%class.T* %this, i8* %c) + ret void + +; CHECK-LABEL: wo_hcaller: +; CHECK: bl wo_hcallee +; CHECK-NEXT: nop +} + +define weak_odr protected void @wo_pcallee(%class.T* %this, i8* %c) { ret void } +define void @wo_pcaller(%class.T* %this, i8* %c) { + call void @wo_pcallee(%class.T* %this, i8* %c) + ret void + +; CHECK-LABEL: wo_pcaller: +; CHECK: bl wo_pcallee +; CHECK-NEXT: nop +} + +define weak_odr void @wo_callee(%class.T* %this, i8* %c) { ret void } +define void @wo_caller(%class.T* %this, i8* 
%c) { + call void @wo_callee(%class.T* %this, i8* %c) + ret void + +; CHECK-LABEL: wo_caller: +; CHECK: bl wo_callee +; CHECK-NEXT: nop +} + +define weak protected void @w_pcallee(i8* %ptr) { ret void } +define void @w_pcaller(i8* %ptr) { + call void @w_pcallee(i8* %ptr) + ret void + +; CHECK-LABEL: w_pcaller: +; CHECK: bl w_pcallee +; CHECK-NEXT: nop +} + +define weak hidden void @w_hcallee(i8* %ptr) { ret void } +define void @w_hcaller(i8* %ptr) { + call void @w_hcallee(i8* %ptr) + ret void + +; CHECK-LABEL: w_hcaller: +; CHECK: bl w_hcallee +; CHECK-NEXT: nop +} + +define weak void @w_callee(i8* %ptr) { ret void } +define void @w_caller(i8* %ptr) { + call void @w_callee(i8* %ptr) + ret void + +; CHECK-LABEL: w_caller: +; CHECK: bl w_callee +; CHECK-NEXT: nop +} + diff --git a/test/CodeGen/PowerPC/ppc64-sibcall.ll b/test/CodeGen/PowerPC/ppc64-sibcall.ll index 418b7828f1d9..59e545601475 100644 --- a/test/CodeGen/PowerPC/ppc64-sibcall.ll +++ b/test/CodeGen/PowerPC/ppc64-sibcall.ll @@ -142,7 +142,7 @@ define void @wo_hcaller(%class.T* %this, i8* %c) { ret void ; CHECK-SCO-LABEL: wo_hcaller: -; CHECK-SCO: b wo_hcallee +; CHECK-SCO: bl wo_hcallee } define weak_odr protected void @wo_pcallee(%class.T* %this, i8* %c) { ret void } @@ -151,7 +151,7 @@ define void @wo_pcaller(%class.T* %this, i8* %c) { ret void ; CHECK-SCO-LABEL: wo_pcaller: -; CHECK-SCO: b wo_pcallee +; CHECK-SCO: bl wo_pcallee } define weak_odr void @wo_callee(%class.T* %this, i8* %c) { ret void } @@ -169,7 +169,7 @@ define void @w_pcaller(i8* %ptr) { ret void ; CHECK-SCO-LABEL: w_pcaller: -; CHECK-SCO: b w_pcallee +; CHECK-SCO: bl w_pcallee } define weak hidden void @w_hcallee(i8* %ptr) { ret void } @@ -178,7 +178,7 @@ define void @w_hcaller(i8* %ptr) { ret void ; CHECK-SCO-LABEL: w_hcaller: -; CHECK-SCO: b w_hcallee +; CHECK-SCO: bl w_hcallee } define weak void @w_callee(i8* %ptr) { ret void } diff --git a/test/CodeGen/SPARC/soft-float.ll b/test/CodeGen/SPARC/soft-float.ll index 53ca1974659e..582804444f3b 100644 --- a/test/CodeGen/SPARC/soft-float.ll +++ b/test/CodeGen/SPARC/soft-float.ll @@ -45,21 +45,21 @@ define fp128 @test_multf3(fp128 %a, fp128 %b) #0 { } define float @test_subsf3(float %a, float %b) #0 { - ; CHCEK-LABEL: test_subsf3: + ; CHECK-LABEL: test_subsf3: ; CHECK: call __subsf3 %sub = fsub float %a, %b ret float %sub } define double @test_subdf3(double %a, double %b) #0 { - ; CHCEK-LABEL: test_subdf3: + ; CHECK-LABEL: test_subdf3: ; CHECK: call __subdf3 %sub = fsub double %a, %b ret double %sub } define fp128 @test_subtf3(fp128 %a, fp128 %b) #0 { - ; CHCEK-LABEL: test_subtf3: + ; CHECK-LABEL: test_subtf3: ; CHECK: call __subtf3 %sub = fsub fp128 %a, %b ret fp128 %sub diff --git a/test/CodeGen/X86/MergeConsecutiveStores.ll b/test/CodeGen/X86/MergeConsecutiveStores.ll index b50253bf2b03..4d7cb765d7b9 100644 --- a/test/CodeGen/X86/MergeConsecutiveStores.ll +++ b/test/CodeGen/X86/MergeConsecutiveStores.ll @@ -371,6 +371,40 @@ define void @MergeLoadStoreBaseIndexOffset(i64* %a, i8* %b, i8* %c, i32 %n) { } ; Make sure that we merge the consecutive load/store sequence below and use a +; word (16 bit) instead of a byte copy for complicated address calculation. +; . 
+; CHECK-LABEL: MergeLoadStoreBaseIndexOffsetComplicated: +; BWON: movzwl (%{{.*}},%{{.*}}), %e[[REG:[a-z]+]] +; BWOFF: movw (%{{.*}},%{{.*}}), %[[REG:[a-z]+]] +; CHECK: movw %[[REG]], (%{{.*}}) +define void @MergeLoadStoreBaseIndexOffsetComplicated(i8* %a, i8* %b, i8* %c, i64 %n) { + br label %1 + +; <label>:1 + %.09 = phi i64 [ 0, %0 ], [ %13, %1 ] + %.08 = phi i8* [ %b, %0 ], [ %12, %1 ] + %2 = load i8, i8* %.08, align 1 + %3 = sext i8 %2 to i64 + %4 = getelementptr inbounds i8, i8* %c, i64 %3 + %5 = load i8, i8* %4, align 1 + %6 = add nsw i64 %3, 1 + %7 = getelementptr inbounds i8, i8* %c, i64 %6 + %8 = load i8, i8* %7, align 1 + %9 = getelementptr inbounds i8, i8* %a, i64 %.09 + store i8 %5, i8* %9, align 1 + %10 = or i64 %.09, 1 + %11 = getelementptr inbounds i8, i8* %a, i64 %10 + store i8 %8, i8* %11, align 1 + %12 = getelementptr inbounds i8, i8* %.08, i64 1 + %13 = add nuw nsw i64 %.09, 2 + %14 = icmp slt i64 %13, %n + br i1 %14, label %1, label %15 + +; <label>:15 + ret void +} + +; Make sure that we merge the consecutive load/store sequence below and use a ; word (16 bit) instead of a byte copy even if there are intermediate sign ; extensions. ; CHECK-LABEL: MergeLoadStoreBaseIndexOffsetSext: diff --git a/test/CodeGen/X86/avx2-vbroadcast.ll b/test/CodeGen/X86/avx2-vbroadcast.ll index 9b4d776b29e3..f65f485cc62c 100644 --- a/test/CodeGen/X86/avx2-vbroadcast.ll +++ b/test/CodeGen/X86/avx2-vbroadcast.ll @@ -209,34 +209,22 @@ entry: } define <4 x i64> @QQ64(i64* %ptr) nounwind uwtable readnone ssp { -; X32-AVX2-LABEL: QQ64: -; X32-AVX2: ## BB#0: ## %entry -; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-AVX2-NEXT: movl (%eax), %ecx -; X32-AVX2-NEXT: movl 4(%eax), %eax -; X32-AVX2-NEXT: vmovd %ecx, %xmm0 -; X32-AVX2-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 -; X32-AVX2-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 -; X32-AVX2-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 -; X32-AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; X32-AVX2-NEXT: retl +; X32-LABEL: QQ64: +; X32: ## BB#0: ## %entry +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl (%eax), %ecx +; X32-NEXT: movl 4(%eax), %eax +; X32-NEXT: vmovd %ecx, %xmm0 +; X32-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 +; X32-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 +; X32-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 +; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT: retl ; ; X64-LABEL: QQ64: ; X64: ## BB#0: ## %entry ; X64-NEXT: vbroadcastsd (%rdi), %ymm0 ; X64-NEXT: retq -; -; X32-AVX512VL-LABEL: QQ64: -; X32-AVX512VL: ## BB#0: ## %entry -; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-AVX512VL-NEXT: movl (%eax), %ecx -; X32-AVX512VL-NEXT: movl 4(%eax), %eax -; X32-AVX512VL-NEXT: vmovd %ecx, %xmm0 -; X32-AVX512VL-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 -; X32-AVX512VL-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 -; X32-AVX512VL-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 -; X32-AVX512VL-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 -; X32-AVX512VL-NEXT: retl entry: %q = load i64, i64* %ptr, align 4 %q0 = insertelement <4 x i64> undef, i64 %q, i32 0 @@ -1105,55 +1093,30 @@ define <4 x double> @splat_concat4(double %d) { ; Those test cases exerce the latter. 
define void @isel_crash_16b(i8* %cV_R.addr) { -; X32-AVX2-LABEL: isel_crash_16b: -; X32-AVX2: ## BB#0: ## %eintry -; X32-AVX2-NEXT: subl $60, %esp -; X32-AVX2-NEXT: Lcfi0: -; X32-AVX2-NEXT: .cfi_def_cfa_offset 64 -; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X32-AVX2-NEXT: vmovaps %xmm0, (%esp) -; X32-AVX2-NEXT: vpbroadcastb (%eax), %xmm1 -; X32-AVX2-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) -; X32-AVX2-NEXT: vmovdqa %xmm1, {{[0-9]+}}(%esp) -; X32-AVX2-NEXT: addl $60, %esp -; X32-AVX2-NEXT: retl -; -; X64-AVX2-LABEL: isel_crash_16b: -; X64-AVX2: ## BB#0: ## %eintry -; X64-AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X64-AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-AVX2-NEXT: movb (%rdi), %al -; X64-AVX2-NEXT: vmovd %eax, %xmm1 -; X64-AVX2-NEXT: vpbroadcastb %xmm1, %xmm1 -; X64-AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-AVX2-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp) -; X64-AVX2-NEXT: retq -; -; X32-AVX512VL-LABEL: isel_crash_16b: -; X32-AVX512VL: ## BB#0: ## %eintry -; X32-AVX512VL-NEXT: subl $60, %esp -; X32-AVX512VL-NEXT: Lcfi0: -; X32-AVX512VL-NEXT: .cfi_def_cfa_offset 64 -; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X32-AVX512VL-NEXT: vmovaps %xmm0, (%esp) -; X32-AVX512VL-NEXT: vpbroadcastb (%eax), %xmm1 -; X32-AVX512VL-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) -; X32-AVX512VL-NEXT: vmovdqa %xmm1, {{[0-9]+}}(%esp) -; X32-AVX512VL-NEXT: addl $60, %esp -; X32-AVX512VL-NEXT: retl +; X32-LABEL: isel_crash_16b: +; X32: ## BB#0: ## %eintry +; X32-NEXT: subl $60, %esp +; X32-NEXT: Lcfi0: +; X32-NEXT: .cfi_def_cfa_offset 64 +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; X32-NEXT: vmovaps %xmm0, (%esp) +; X32-NEXT: vpbroadcastb (%eax), %xmm1 +; X32-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) +; X32-NEXT: vmovdqa %xmm1, {{[0-9]+}}(%esp) +; X32-NEXT: addl $60, %esp +; X32-NEXT: retl ; -; X64-AVX512VL-LABEL: isel_crash_16b: -; X64-AVX512VL: ## BB#0: ## %eintry -; X64-AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X64-AVX512VL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-AVX512VL-NEXT: movb (%rdi), %al -; X64-AVX512VL-NEXT: vmovd %eax, %xmm1 -; X64-AVX512VL-NEXT: vpbroadcastb %xmm1, %xmm1 -; X64-AVX512VL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-AVX512VL-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp) -; X64-AVX512VL-NEXT: retq +; X64-LABEL: isel_crash_16b: +; X64: ## BB#0: ## %eintry +; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NEXT: movb (%rdi), %al +; X64-NEXT: vmovd %eax, %xmm1 +; X64-NEXT: vpbroadcastb %xmm1, %xmm1 +; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp) +; X64-NEXT: retq eintry: %__a.addr.i = alloca <2 x i64>, align 16 %__b.addr.i = alloca <2 x i64>, align 16 @@ -1277,55 +1240,30 @@ eintry: } define void @isel_crash_8w(i16* %cV_R.addr) { -; X32-AVX2-LABEL: isel_crash_8w: -; X32-AVX2: ## BB#0: ## %entry -; X32-AVX2-NEXT: subl $60, %esp -; X32-AVX2-NEXT: Lcfi4: -; X32-AVX2-NEXT: .cfi_def_cfa_offset 64 -; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X32-AVX2-NEXT: vmovaps %xmm0, (%esp) -; X32-AVX2-NEXT: vpbroadcastw (%eax), %xmm1 -; X32-AVX2-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) -; X32-AVX2-NEXT: vmovdqa %xmm1, {{[0-9]+}}(%esp) -; X32-AVX2-NEXT: addl $60, %esp -; X32-AVX2-NEXT: retl -; -; X64-AVX2-LABEL: isel_crash_8w: -; X64-AVX2: ## BB#0: ## %entry -; X64-AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X64-AVX2-NEXT: vmovaps %xmm0, 
-{{[0-9]+}}(%rsp) -; X64-AVX2-NEXT: movw (%rdi), %ax -; X64-AVX2-NEXT: vmovd %eax, %xmm1 -; X64-AVX2-NEXT: vpbroadcastw %xmm1, %xmm1 -; X64-AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-AVX2-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp) -; X64-AVX2-NEXT: retq -; -; X32-AVX512VL-LABEL: isel_crash_8w: -; X32-AVX512VL: ## BB#0: ## %entry -; X32-AVX512VL-NEXT: subl $60, %esp -; X32-AVX512VL-NEXT: Lcfi4: -; X32-AVX512VL-NEXT: .cfi_def_cfa_offset 64 -; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X32-AVX512VL-NEXT: vmovaps %xmm0, (%esp) -; X32-AVX512VL-NEXT: vpbroadcastw (%eax), %xmm1 -; X32-AVX512VL-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) -; X32-AVX512VL-NEXT: vmovdqa %xmm1, {{[0-9]+}}(%esp) -; X32-AVX512VL-NEXT: addl $60, %esp -; X32-AVX512VL-NEXT: retl +; X32-LABEL: isel_crash_8w: +; X32: ## BB#0: ## %entry +; X32-NEXT: subl $60, %esp +; X32-NEXT: Lcfi4: +; X32-NEXT: .cfi_def_cfa_offset 64 +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; X32-NEXT: vmovaps %xmm0, (%esp) +; X32-NEXT: vpbroadcastw (%eax), %xmm1 +; X32-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) +; X32-NEXT: vmovdqa %xmm1, {{[0-9]+}}(%esp) +; X32-NEXT: addl $60, %esp +; X32-NEXT: retl ; -; X64-AVX512VL-LABEL: isel_crash_8w: -; X64-AVX512VL: ## BB#0: ## %entry -; X64-AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X64-AVX512VL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-AVX512VL-NEXT: movw (%rdi), %ax -; X64-AVX512VL-NEXT: vmovd %eax, %xmm1 -; X64-AVX512VL-NEXT: vpbroadcastw %xmm1, %xmm1 -; X64-AVX512VL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-AVX512VL-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp) -; X64-AVX512VL-NEXT: retq +; X64-LABEL: isel_crash_8w: +; X64: ## BB#0: ## %entry +; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NEXT: movw (%rdi), %ax +; X64-NEXT: vmovd %eax, %xmm1 +; X64-NEXT: vpbroadcastw %xmm1, %xmm1 +; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp) +; X64-NEXT: retq entry: %__a.addr.i = alloca <2 x i64>, align 16 %__b.addr.i = alloca <2 x i64>, align 16 @@ -1605,24 +1543,24 @@ eintry: } define void @isel_crash_2q(i64* %cV_R.addr) { -; X32-AVX2-LABEL: isel_crash_2q: -; X32-AVX2: ## BB#0: ## %entry -; X32-AVX2-NEXT: subl $60, %esp -; X32-AVX2-NEXT: Lcfi12: -; X32-AVX2-NEXT: .cfi_def_cfa_offset 64 -; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X32-AVX2-NEXT: vmovaps %xmm0, (%esp) -; X32-AVX2-NEXT: movl (%eax), %ecx -; X32-AVX2-NEXT: movl 4(%eax), %eax -; X32-AVX2-NEXT: vmovd %ecx, %xmm1 -; X32-AVX2-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 -; X32-AVX2-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1 -; X32-AVX2-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1 -; X32-AVX2-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) -; X32-AVX2-NEXT: vmovdqa %xmm1, {{[0-9]+}}(%esp) -; X32-AVX2-NEXT: addl $60, %esp -; X32-AVX2-NEXT: retl +; X32-LABEL: isel_crash_2q: +; X32: ## BB#0: ## %entry +; X32-NEXT: subl $60, %esp +; X32-NEXT: Lcfi12: +; X32-NEXT: .cfi_def_cfa_offset 64 +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; X32-NEXT: vmovaps %xmm0, (%esp) +; X32-NEXT: movl (%eax), %ecx +; X32-NEXT: movl 4(%eax), %eax +; X32-NEXT: vmovd %ecx, %xmm1 +; X32-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; X32-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1 +; X32-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1 +; X32-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) +; X32-NEXT: vmovdqa %xmm1, {{[0-9]+}}(%esp) +; X32-NEXT: addl $60, %esp +; X32-NEXT: retl ; ; X64-AVX2-LABEL: 
isel_crash_2q: ; X64-AVX2: ## BB#0: ## %entry @@ -1635,25 +1573,6 @@ define void @isel_crash_2q(i64* %cV_R.addr) { ; X64-AVX2-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp) ; X64-AVX2-NEXT: retq ; -; X32-AVX512VL-LABEL: isel_crash_2q: -; X32-AVX512VL: ## BB#0: ## %entry -; X32-AVX512VL-NEXT: subl $60, %esp -; X32-AVX512VL-NEXT: Lcfi12: -; X32-AVX512VL-NEXT: .cfi_def_cfa_offset 64 -; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X32-AVX512VL-NEXT: vmovaps %xmm0, (%esp) -; X32-AVX512VL-NEXT: movl (%eax), %ecx -; X32-AVX512VL-NEXT: movl 4(%eax), %eax -; X32-AVX512VL-NEXT: vmovd %ecx, %xmm1 -; X32-AVX512VL-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 -; X32-AVX512VL-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1 -; X32-AVX512VL-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1 -; X32-AVX512VL-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) -; X32-AVX512VL-NEXT: vmovdqa %xmm1, {{[0-9]+}}(%esp) -; X32-AVX512VL-NEXT: addl $60, %esp -; X32-AVX512VL-NEXT: retl -; ; X64-AVX512VL-LABEL: isel_crash_2q: ; X64-AVX512VL: ## BB#0: ## %entry ; X64-AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0 @@ -1752,7 +1671,7 @@ define void @isel_crash_4q(i64* %cV_R.addr) { ; X32-AVX512VL-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 ; X32-AVX512VL-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1 ; X32-AVX512VL-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1 -; X32-AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm1, %ymm1 +; X32-AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 ; X32-AVX512VL-NEXT: vmovaps %ymm0, {{[0-9]+}}(%esp) ; X32-AVX512VL-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%esp) ; X32-AVX512VL-NEXT: movl %ebp, %esp diff --git a/test/CodeGen/X86/avx512-any_extend_load.ll b/test/CodeGen/X86/avx512-any_extend_load.ll index 656b618eff55..87f8cc9a418e 100644 --- a/test/CodeGen/X86/avx512-any_extend_load.ll +++ b/test/CodeGen/X86/avx512-any_extend_load.ll @@ -22,10 +22,8 @@ define void @any_extend_load_v8i64(<8 x i8> * %ptr) { define void @any_extend_load_v8i32(<8 x i8> * %ptr) { ; KNL-LABEL: any_extend_load_v8i32: ; KNL: # BB#0: -; KNL-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 -; KNL-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; KNL-NEXT: vpmovdw %zmm0, %ymm0 +; KNL-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; KNL-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 ; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] ; KNL-NEXT: vmovq %xmm0, (%rdi) ; KNL-NEXT: retq diff --git a/test/CodeGen/X86/avx512-extract-subvector.ll b/test/CodeGen/X86/avx512-extract-subvector.ll index 9e8662452822..391bf6ba4554 100644 --- a/test/CodeGen/X86/avx512-extract-subvector.ll +++ b/test/CodeGen/X86/avx512-extract-subvector.ll @@ -60,7 +60,7 @@ define <32 x i8> @extract_subvector256_v64i8(<64 x i8> %x) nounwind { define void @extract_subvector256_v8f64_store(double* nocapture %addr, <4 x double> %a) nounwind uwtable ssp { ; SKX-LABEL: extract_subvector256_v8f64_store: ; SKX: ## BB#0: ## %entry -; SKX-NEXT: vextractf64x2 $1, %ymm0, (%rdi) +; SKX-NEXT: vextractf128 $1, %ymm0, (%rdi) ; SKX-NEXT: retq entry: %0 = shufflevector <4 x double> %a, <4 x double> undef, <2 x i32> <i32 2, i32 3> @@ -72,7 +72,7 @@ entry: define void @extract_subvector256_v8f32_store(float* nocapture %addr, <8 x float> %a) nounwind uwtable ssp { ; SKX-LABEL: extract_subvector256_v8f32_store: ; SKX: ## BB#0: ## 
%entry -; SKX-NEXT: vextractf32x4 $1, %ymm0, (%rdi) +; SKX-NEXT: vextractf128 $1, %ymm0, (%rdi) ; SKX-NEXT: retq entry: %0 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> @@ -84,7 +84,7 @@ entry: define void @extract_subvector256_v4i64_store(i64* nocapture %addr, <4 x i64> %a) nounwind uwtable ssp { ; SKX-LABEL: extract_subvector256_v4i64_store: ; SKX: ## BB#0: ## %entry -; SKX-NEXT: vextracti64x2 $1, %ymm0, (%rdi) +; SKX-NEXT: vextracti128 $1, %ymm0, (%rdi) ; SKX-NEXT: retq entry: %0 = shufflevector <4 x i64> %a, <4 x i64> undef, <2 x i32> <i32 2, i32 3> @@ -96,7 +96,7 @@ entry: define void @extract_subvector256_v8i32_store(i32* nocapture %addr, <8 x i32> %a) nounwind uwtable ssp { ; SKX-LABEL: extract_subvector256_v8i32_store: ; SKX: ## BB#0: ## %entry -; SKX-NEXT: vextracti32x4 $1, %ymm0, (%rdi) +; SKX-NEXT: vextracti128 $1, %ymm0, (%rdi) ; SKX-NEXT: retq entry: %0 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> @@ -108,7 +108,7 @@ entry: define void @extract_subvector256_v16i16_store(i16* nocapture %addr, <16 x i16> %a) nounwind uwtable ssp { ; SKX-LABEL: extract_subvector256_v16i16_store: ; SKX: ## BB#0: ## %entry -; SKX-NEXT: vextracti32x4 $1, %ymm0, (%rdi) +; SKX-NEXT: vextracti128 $1, %ymm0, (%rdi) ; SKX-NEXT: retq entry: %0 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> @@ -120,7 +120,7 @@ entry: define void @extract_subvector256_v32i8_store(i8* nocapture %addr, <32 x i8> %a) nounwind uwtable ssp { ; SKX-LABEL: extract_subvector256_v32i8_store: ; SKX: ## BB#0: ## %entry -; SKX-NEXT: vextracti32x4 $1, %ymm0, (%rdi) +; SKX-NEXT: vextracti128 $1, %ymm0, (%rdi) ; SKX-NEXT: retq entry: %0 = shufflevector <32 x i8> %a, <32 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> diff --git a/test/CodeGen/X86/avx512-insert-extract.ll b/test/CodeGen/X86/avx512-insert-extract.ll index c6cc74289971..26d14fa0840f 100644 --- a/test/CodeGen/X86/avx512-insert-extract.ll +++ b/test/CodeGen/X86/avx512-insert-extract.ll @@ -463,7 +463,7 @@ define i64 @extract_v4i64(<4 x i64> %x, i64* %dst) { ; SKX-LABEL: extract_v4i64: ; SKX: ## BB#0: ; SKX-NEXT: vpextrq $1, %xmm0, %rax -; SKX-NEXT: vextracti64x2 $1, %ymm0, %xmm0 +; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0 ; SKX-NEXT: vpextrq $1, %xmm0, (%rdi) ; SKX-NEXT: retq %r1 = extractelement <4 x i64> %x, i32 1 @@ -521,7 +521,7 @@ define i32 @extract_v8i32(<8 x i32> %x, i32* %dst) { ; SKX-LABEL: extract_v8i32: ; SKX: ## BB#0: ; SKX-NEXT: vpextrd $1, %xmm0, %eax -; SKX-NEXT: vextracti32x4 $1, %ymm0, %xmm0 +; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0 ; SKX-NEXT: vpextrd $1, %xmm0, (%rdi) ; SKX-NEXT: retq %r1 = extractelement <8 x i32> %x, i32 1 @@ -582,7 +582,7 @@ define i16 @extract_v16i16(<16 x i16> %x, i16* %dst) { ; SKX-LABEL: extract_v16i16: ; SKX: ## BB#0: ; SKX-NEXT: vpextrw $1, %xmm0, %eax -; SKX-NEXT: vextracti32x4 $1, %ymm0, %xmm0 +; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0 ; SKX-NEXT: vpextrw $1, %xmm0, (%rdi) ; SKX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> ; SKX-NEXT: retq @@ -646,7 +646,7 @@ define i8 @extract_v32i8(<32 x i8> %x, i8* %dst) { ; SKX-LABEL: extract_v32i8: ; SKX: ## BB#0: ; SKX-NEXT: vpextrb $1, %xmm0, %eax -; SKX-NEXT: vextracti32x4 $1, %ymm0, %xmm0 +; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0 ; SKX-NEXT: vpextrb $1, %xmm0, (%rdi) ; SKX-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> ; 
SKX-NEXT: retq @@ -714,9 +714,9 @@ define <4 x i64> @insert_v4i64(<4 x i64> %x, i64 %y , i64* %ptr) { ; SKX: ## BB#0: ; SKX-NEXT: vpinsrq $1, (%rsi), %xmm0, %xmm1 ; SKX-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; SKX-NEXT: vextracti64x2 $1, %ymm0, %xmm1 +; SKX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; SKX-NEXT: vpinsrq $1, %rdi, %xmm1, %xmm1 -; SKX-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm0 +; SKX-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; SKX-NEXT: retq %val = load i64, i64* %ptr %r1 = insertelement <4 x i64> %x, i64 %val, i32 1 @@ -780,9 +780,9 @@ define <8 x i32> @insert_v8i32(<8 x i32> %x, i32 %y, i32* %ptr) { ; SKX: ## BB#0: ; SKX-NEXT: vpinsrd $1, (%rsi), %xmm0, %xmm1 ; SKX-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; SKX-NEXT: vextracti32x4 $1, %ymm0, %xmm1 +; SKX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; SKX-NEXT: vpinsrd $1, %edi, %xmm1, %xmm1 -; SKX-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 +; SKX-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; SKX-NEXT: retq %val = load i32, i32* %ptr %r1 = insertelement <8 x i32> %x, i32 %val, i32 1 @@ -846,9 +846,9 @@ define <16 x i16> @insert_v16i16(<16 x i16> %x, i16 %y, i16* %ptr) { ; SKX: ## BB#0: ; SKX-NEXT: vpinsrw $1, (%rsi), %xmm0, %xmm1 ; SKX-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; SKX-NEXT: vextracti32x4 $1, %ymm0, %xmm1 +; SKX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; SKX-NEXT: vpinsrw $1, %edi, %xmm1, %xmm1 -; SKX-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 +; SKX-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; SKX-NEXT: retq %val = load i16, i16* %ptr %r1 = insertelement <16 x i16> %x, i16 %val, i32 1 @@ -912,9 +912,9 @@ define <32 x i8> @insert_v32i8(<32 x i8> %x, i8 %y, i8* %ptr) { ; SKX: ## BB#0: ; SKX-NEXT: vpinsrb $1, (%rsi), %xmm0, %xmm1 ; SKX-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; SKX-NEXT: vextracti32x4 $1, %ymm0, %xmm1 +; SKX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; SKX-NEXT: vpinsrb $1, %edi, %xmm1, %xmm1 -; SKX-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 +; SKX-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; SKX-NEXT: retq %val = load i8, i8* %ptr %r1 = insertelement <32 x i8> %x, i8 %val, i32 1 @@ -1014,9 +1014,9 @@ define <16 x i16> @test_insert_128_v16i16(<16 x i16> %x, i16 %y) { ; ; SKX-LABEL: test_insert_128_v16i16: ; SKX: ## BB#0: -; SKX-NEXT: vextracti32x4 $1, %ymm0, %xmm1 +; SKX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; SKX-NEXT: vpinsrw $2, %edi, %xmm1, %xmm1 -; SKX-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 +; SKX-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; SKX-NEXT: retq %r = insertelement <16 x i16> %x, i16 %y, i32 10 ret <16 x i16> %r @@ -1032,9 +1032,9 @@ define <32 x i8> @test_insert_128_v32i8(<32 x i8> %x, i8 %y) { ; ; SKX-LABEL: test_insert_128_v32i8: ; SKX: ## BB#0: -; SKX-NEXT: vextracti32x4 $1, %ymm0, %xmm1 +; SKX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; SKX-NEXT: vpinsrb $4, %edi, %xmm1, %xmm1 -; SKX-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 +; SKX-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; SKX-NEXT: retq %r = insertelement <32 x i8> %x, i8 %y, i32 20 ret <32 x i8> %r diff --git a/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512-intrinsics-upgrade.ll index f422a0354988..3c649e18bc38 100644 --- a/test/CodeGen/X86/avx512-intrinsics-upgrade.ll +++ b/test/CodeGen/X86/avx512-intrinsics-upgrade.ll @@ -2868,3 +2868,187 @@ define <8 x i64> @test_mask_mul_epu32_rmbkz(<16 x i32> %a, i64* %ptr_b, i8 %mask } declare <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32>, <16 x i32>, <8 x i64>, i8) + +define <4 x float> 
@test_mask_vextractf32x4(<4 x float> %b, <16 x float> %a, i8 %mask) { +; CHECK-LABEL: test_mask_vextractf32x4: +; CHECK: ## BB#0: +; CHECK-NEXT: vextractf32x4 $2, %zmm1, %xmm1 +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: kshiftlw $12, %k1, %k0 +; CHECK-NEXT: kshiftrw $15, %k0, %k0 +; CHECK-NEXT: kshiftlw $13, %k1, %k2 +; CHECK-NEXT: kshiftrw $15, %k2, %k2 +; CHECK-NEXT: kshiftlw $15, %k1, %k3 +; CHECK-NEXT: kshiftrw $15, %k3, %k3 +; CHECK-NEXT: kshiftlw $14, %k1, %k1 +; CHECK-NEXT: kshiftrw $15, %k1, %k1 +; CHECK-NEXT: kmovw %k1, %eax +; CHECK-NEXT: kmovw %k3, %ecx +; CHECK-NEXT: vmovd %ecx, %xmm2 +; CHECK-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 +; CHECK-NEXT: kmovw %k2, %eax +; CHECK-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vpinsrd $3, %eax, %xmm2, %xmm2 +; CHECK-NEXT: vpslld $31, %xmm2, %xmm2 +; CHECK-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.avx512.mask.vextractf32x4.512(<16 x float> %a, i32 2, <4 x float> %b, i8 %mask) + ret <4 x float> %res +} + +declare <4 x float> @llvm.x86.avx512.mask.vextractf32x4.512(<16 x float>, i32, <4 x float>, i8) + +define <4 x i64> @test_mask_vextracti64x4(<4 x i64> %b, <8 x i64> %a, i8 %mask) { +; CHECK-LABEL: test_mask_vextracti64x4: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: kshiftlw $12, %k1, %k0 +; CHECK-NEXT: kshiftrw $15, %k0, %k0 +; CHECK-NEXT: kshiftlw $13, %k1, %k2 +; CHECK-NEXT: kshiftrw $15, %k2, %k2 +; CHECK-NEXT: kshiftlw $15, %k1, %k3 +; CHECK-NEXT: kshiftrw $15, %k3, %k3 +; CHECK-NEXT: kshiftlw $14, %k1, %k1 +; CHECK-NEXT: kshiftrw $15, %k1, %k1 +; CHECK-NEXT: kmovw %k1, %eax +; CHECK-NEXT: kmovw %k3, %ecx +; CHECK-NEXT: vmovd %ecx, %xmm2 +; CHECK-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 +; CHECK-NEXT: kmovw %k2, %eax +; CHECK-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vpinsrd $3, %eax, %xmm2, %xmm2 +; CHECK-NEXT: vpslld $31, %xmm2, %xmm2 +; CHECK-NEXT: vpmovsxdq %xmm2, %ymm2 +; CHECK-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: retq + %res = call <4 x i64> @llvm.x86.avx512.mask.vextracti64x4.512(<8 x i64> %a, i32 2, <4 x i64> %b, i8 %mask) + ret <4 x i64> %res +} + +declare <4 x i64> @llvm.x86.avx512.mask.vextracti64x4.512(<8 x i64>, i32, <4 x i64>, i8) + +define <4 x i32> @test_maskz_vextracti32x4(<16 x i32> %a, i8 %mask) { +; CHECK-LABEL: test_maskz_vextracti32x4: +; CHECK: ## BB#0: +; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm0 +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: kshiftlw $12, %k1, %k0 +; CHECK-NEXT: kshiftrw $15, %k0, %k0 +; CHECK-NEXT: kshiftlw $13, %k1, %k2 +; CHECK-NEXT: kshiftrw $15, %k2, %k2 +; CHECK-NEXT: kshiftlw $15, %k1, %k3 +; CHECK-NEXT: kshiftrw $15, %k3, %k3 +; CHECK-NEXT: kshiftlw $14, %k1, %k1 +; CHECK-NEXT: kshiftrw $15, %k1, %k1 +; CHECK-NEXT: kmovw %k1, %eax +; CHECK-NEXT: kmovw %k3, %ecx +; CHECK-NEXT: vmovd %ecx, %xmm1 +; CHECK-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; CHECK-NEXT: kmovw %k2, %eax +; CHECK-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1 +; CHECK-NEXT: vpslld $31, %xmm1, %xmm1 +; CHECK-NEXT: vpsrad $31, %xmm1, %xmm1 +; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x i32> @llvm.x86.avx512.mask.vextracti32x4.512(<16 x i32> %a, i32 2, <4 x i32> zeroinitializer, i8 %mask) + ret <4 x i32> %res +} + +declare <4 x i32> @llvm.x86.avx512.mask.vextracti32x4.512(<16 x i32>, i32, <4 x i32>, i8) + +define <4 x double> @test_vextractf64x4(<8 x 
double> %a) { +; CHECK-LABEL: test_vextractf64x4: +; CHECK: ## BB#0: +; CHECK-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> +; CHECK-NEXT: retq + %res = call <4 x double> @llvm.x86.avx512.mask.vextractf64x4.512(<8 x double> %a, i32 2, <4 x double> zeroinitializer, i8 -1) + ret <4 x double> %res +} + +declare <4 x double> @llvm.x86.avx512.mask.vextractf64x4.512(<8 x double>, i32, <4 x double>, i8) + +declare <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float>, <4 x float>, i32, <16 x float>, i16) + +define <16 x float>@test_int_x86_avx512_mask_insertf32x4_512(<16 x float> %x0, <4 x float> %x1, <16 x float> %x3, i16 %x4) { +; CHECK-LABEL: test_int_x86_avx512_mask_insertf32x4_512: +; CHECK: ## BB#0: +; CHECK-NEXT: ## kill: %XMM1<def> %XMM1<kill> %ZMM1<def> +; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm3 +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm1 +; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq + %res = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> %x3, i16 %x4) + %res1 = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> %x3, i16 -1) + %res2 = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> zeroinitializer, i16 %x4) + %res3 = fadd <16 x float> %res, %res1 + %res4 = fadd <16 x float> %res2, %res3 + ret <16 x float> %res4 +} + +declare <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32>, <4 x i32>, i32, <16 x i32>, i16) + +define <16 x i32>@test_int_x86_avx512_mask_inserti32x4_512(<16 x i32> %x0, <4 x i32> %x1, <16 x i32> %x3, i16 %x4) { +; CHECK-LABEL: test_int_x86_avx512_mask_inserti32x4_512: +; CHECK: ## BB#0: +; CHECK-NEXT: ## kill: %XMM1<def> %XMM1<kill> %ZMM1<def> +; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm3 +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: vpaddd %zmm3, %zmm2, %zmm1 +; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq + %res = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i16 %x4) + %res1 = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i16 -1) + %res2 = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> zeroinitializer, i16 %x4) + %res3 = add <16 x i32> %res, %res1 + %res4 = add <16 x i32> %res2, %res3 + ret <16 x i32> %res4 +} + +declare <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double>, <4 x double>, i32, <8 x double>, i8) + +define <8 x double>@test_int_x86_avx512_mask_insertf64x4_512(<8 x double> %x0, <4 x double> %x1, <8 x double> %x3, i8 %x4) { +; CHECK-LABEL: test_int_x86_avx512_mask_insertf64x4_512: +; CHECK: ## BB#0: +; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm3 +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: vaddpd %zmm3, %zmm2, %zmm1 +; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq + %res = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> %x3, i8 %x4) + %res1 = call <8 x 
double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> %x3, i8 -1) + %res2 = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> zeroinitializer, i8 %x4) + %res3 = fadd <8 x double> %res, %res1 + %res4 = fadd <8 x double> %res2, %res3 + ret <8 x double> %res4 +} + +declare <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64>, <4 x i64>, i32, <8 x i64>, i8) + +define <8 x i64>@test_int_x86_avx512_mask_inserti64x4_512(<8 x i64> %x0, <4 x i64> %x1, <8 x i64> %x3, i8 %x4) { +; CHECK-LABEL: test_int_x86_avx512_mask_inserti64x4_512: +; CHECK: ## BB#0: +; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm3 +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm1 +; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq + %res = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> %x3, i8 %x4) + %res1 = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> %x3, i8 -1) + %res2 = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> zeroinitializer, i8 %x4) + %res3 = add <8 x i64> %res, %res1 + %res4 = add <8 x i64> %res2, %res3 + ret <8 x i64> %res4 +} diff --git a/test/CodeGen/X86/avx512-intrinsics.ll b/test/CodeGen/X86/avx512-intrinsics.ll index 5442693806f3..3015a2b499ff 100644 --- a/test/CodeGen/X86/avx512-intrinsics.ll +++ b/test/CodeGen/X86/avx512-intrinsics.ll @@ -1243,53 +1243,6 @@ define <8 x i8> @test_mask_ucmp_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { declare i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64>, <8 x i64>, i32, i8) nounwind readnone -define <4 x float> @test_mask_vextractf32x4(<4 x float> %b, <16 x float> %a, i8 %mask) { -; CHECK-LABEL: test_mask_vextractf32x4: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vextractf32x4 $2, %zmm1, %xmm0 {%k1} -; CHECK-NEXT: retq - %res = call <4 x float> @llvm.x86.avx512.mask.vextractf32x4.512(<16 x float> %a, i32 2, <4 x float> %b, i8 %mask) - ret <4 x float> %res -} - -declare <4 x float> @llvm.x86.avx512.mask.vextractf32x4.512(<16 x float>, i32, <4 x float>, i8) - -define <4 x i64> @test_mask_vextracti64x4(<4 x i64> %b, <8 x i64> %a, i8 %mask) { -; CHECK-LABEL: test_mask_vextracti64x4: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vextracti64x4 $2, %zmm1, %ymm0 {%k1} -; CHECK-NEXT: retq - %res = call <4 x i64> @llvm.x86.avx512.mask.vextracti64x4.512(<8 x i64> %a, i32 2, <4 x i64> %b, i8 %mask) - ret <4 x i64> %res -} - -declare <4 x i64> @llvm.x86.avx512.mask.vextracti64x4.512(<8 x i64>, i32, <4 x i64>, i8) - -define <4 x i32> @test_maskz_vextracti32x4(<16 x i32> %a, i8 %mask) { -; CHECK-LABEL: test_maskz_vextracti32x4: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm0 {%k1} {z} -; CHECK-NEXT: retq - %res = call <4 x i32> @llvm.x86.avx512.mask.vextracti32x4.512(<16 x i32> %a, i32 2, <4 x i32> zeroinitializer, i8 %mask) - ret <4 x i32> %res -} - -declare <4 x i32> @llvm.x86.avx512.mask.vextracti32x4.512(<16 x i32>, i32, <4 x i32>, i8) - -define <4 x double> @test_vextractf64x4(<8 x double> %a) { -; CHECK-LABEL: test_vextractf64x4: -; CHECK: ## BB#0: -; CHECK-NEXT: vextractf64x4 $2, %zmm0, %ymm0 -; CHECK-NEXT: retq - %res = call <4 x double> 
@llvm.x86.avx512.mask.vextractf64x4.512(<8 x double> %a, i32 2, <4 x double> zeroinitializer, i8 -1) - ret <4 x double> %res -} - -declare <4 x double> @llvm.x86.avx512.mask.vextractf64x4.512(<8 x double>, i32, <4 x double>, i8) - declare <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) declare <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) declare <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) @@ -3984,86 +3937,6 @@ define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512_constant_pool_maskz(<1 ret <16 x float> %res2 } -declare <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float>, <4 x float>, i32, <16 x float>, i16) - -define <16 x float>@test_int_x86_avx512_mask_insertf32x4_512(<16 x float> %x0, <4 x float> %x1, <16 x float> %x3, i16 %x4) { -; CHECK-LABEL: test_int_x86_avx512_mask_insertf32x4_512: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm2 {%k1} -; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm3 {%k1} {z} -; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0 -; CHECK-NEXT: vaddps %zmm0, %zmm2, %zmm0 -; CHECK-NEXT: vaddps %zmm0, %zmm3, %zmm0 -; CHECK-NEXT: retq - %res = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> %x3, i16 %x4) - %res1 = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> %x3, i16 -1) - %res2 = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> zeroinitializer, i16 %x4) - %res3 = fadd <16 x float> %res, %res1 - %res4 = fadd <16 x float> %res2, %res3 - ret <16 x float> %res4 -} - -declare <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32>, <4 x i32>, i32, <16 x i32>, i16) - -define <16 x i32>@test_int_x86_avx512_mask_inserti32x4_512(<16 x i32> %x0, <4 x i32> %x1, <16 x i32> %x3, i16 %x4) { -; CHECK-LABEL: test_int_x86_avx512_mask_inserti32x4_512: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm2 {%k1} -; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm3 {%k1} {z} -; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 -; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0 -; CHECK-NEXT: vpaddd %zmm0, %zmm3, %zmm0 -; CHECK-NEXT: retq - %res = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i16 %x4) - %res1 = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i16 -1) - %res2 = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> zeroinitializer, i16 %x4) - %res3 = add <16 x i32> %res, %res1 - %res4 = add <16 x i32> %res2, %res3 - ret <16 x i32> %res4 -} - -declare <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double>, <4 x double>, i32, <8 x double>, i8) - -define <8 x double>@test_int_x86_avx512_mask_insertf64x4_512(<8 x double> %x0, <4 x double> %x1, <8 x double> %x3, i8 %x4) { -; CHECK-LABEL: test_int_x86_avx512_mask_insertf64x4_512: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm2 {%k1} -; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm3 {%k1} {z} -; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 -; CHECK-NEXT: vaddpd %zmm0, %zmm2, %zmm0 -; CHECK-NEXT: vaddpd %zmm0, %zmm3, %zmm0 -; CHECK-NEXT: retq - 
%res = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> %x3, i8 %x4) - %res1 = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> %x3, i8 -1) - %res2 = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> zeroinitializer, i8 %x4) - %res3 = fadd <8 x double> %res, %res1 - %res4 = fadd <8 x double> %res2, %res3 - ret <8 x double> %res4 -} - -declare <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64>, <4 x i64>, i32, <8 x i64>, i8) - -define <8 x i64>@test_int_x86_avx512_mask_inserti64x4_512(<8 x i64> %x0, <4 x i64> %x1, <8 x i64> %x3, i8 %x4) { -; CHECK-LABEL: test_int_x86_avx512_mask_inserti64x4_512: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm2 {%k1} -; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm3 {%k1} {z} -; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0 -; CHECK-NEXT: vpaddq %zmm0, %zmm3, %zmm0 -; CHECK-NEXT: retq - %res = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> %x3, i8 %x4) - %res1 = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> %x3, i8 -1) - %res2 = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> zeroinitializer, i8 %x4) - %res3 = add <8 x i64> %res, %res1 - %res4 = add <8 x i64> %res2, %res3 - ret <8 x i64> %res4 -} - declare <2 x double> @llvm.x86.avx512.mask.cvtss2sd.round(<2 x double>, <4 x float>, <2 x double>, i8, i32) define <2 x double>@test_int_x86_avx512_mask_cvt_ss2sd_round(<2 x double> %x0,<4 x float> %x1, <2 x double> %x2, i8 %x3) { diff --git a/test/CodeGen/X86/avx512-skx-insert-subvec.ll b/test/CodeGen/X86/avx512-skx-insert-subvec.ll index a5bceb7670a0..2200f1159880 100644 --- a/test/CodeGen/X86/avx512-skx-insert-subvec.ll +++ b/test/CodeGen/X86/avx512-skx-insert-subvec.ll @@ -30,9 +30,9 @@ define <8 x i1> @test2(<2 x i1> %a) { ; CHECK: # BB#0: ; CHECK-NEXT: vpsllq $63, %xmm0, %xmm0 ; CHECK-NEXT: vptestmq %xmm0, %xmm0, %k0 -; CHECK-NEXT: vpmovm2q %k0, %zmm0 -; CHECK-NEXT: vpxord %zmm1, %zmm1, %zmm1 -; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,0,1],zmm0[0,1,0,1] +; CHECK-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; CHECK-NEXT: vpmovm2q %k0, %zmm1 +; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; CHECK-NEXT: vpmovq2m %zmm0, %k0 ; CHECK-NEXT: vpmovm2w %k0, %xmm0 ; CHECK-NEXT: retq diff --git a/test/CodeGen/X86/avx512-vbroadcasti128.ll b/test/CodeGen/X86/avx512-vbroadcasti128.ll index 09c48ddf81a1..ad8a29cacd82 100644 --- a/test/CodeGen/X86/avx512-vbroadcasti128.ll +++ b/test/CodeGen/X86/avx512-vbroadcasti128.ll @@ -237,7 +237,7 @@ define <8 x i32> @PR29088(<4 x i32>* %p0, <8 x float>* %p1) { ; X64-AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 ; X64-AVX512VL-NEXT: vpxor %ymm1, %ymm1, %ymm1 ; X64-AVX512VL-NEXT: vmovdqa %ymm1, (%rsi) -; X64-AVX512VL-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; X64-AVX512VL-NEXT: retq ; ; X64-AVX512BWVL-LABEL: PR29088: @@ -245,7 +245,7 @@ define <8 x i32> @PR29088(<4 x i32>* %p0, <8 x float>* %p1) { ; X64-AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 ; X64-AVX512BWVL-NEXT: vpxor %ymm1, %ymm1, %ymm1 ; X64-AVX512BWVL-NEXT: vmovdqa %ymm1, (%rsi) -; X64-AVX512BWVL-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX512BWVL-NEXT: vinserti128 $1, %xmm0, 
%ymm0, %ymm0 ; X64-AVX512BWVL-NEXT: retq ; ; X64-AVX512DQVL-LABEL: PR29088: @@ -253,7 +253,7 @@ define <8 x i32> @PR29088(<4 x i32>* %p0, <8 x float>* %p1) { ; X64-AVX512DQVL-NEXT: vmovdqa (%rdi), %xmm0 ; X64-AVX512DQVL-NEXT: vxorps %ymm1, %ymm1, %ymm1 ; X64-AVX512DQVL-NEXT: vmovaps %ymm1, (%rsi) -; X64-AVX512DQVL-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX512DQVL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; X64-AVX512DQVL-NEXT: retq %ld = load <4 x i32>, <4 x i32>* %p0 store <8 x float> zeroinitializer, <8 x float>* %p1 diff --git a/test/CodeGen/X86/avx512bwvl-intrinsics.ll b/test/CodeGen/X86/avx512bwvl-intrinsics.ll index 6fd111577440..7a9d7d7885ff 100644 --- a/test/CodeGen/X86/avx512bwvl-intrinsics.ll +++ b/test/CodeGen/X86/avx512bwvl-intrinsics.ll @@ -30,7 +30,7 @@ define <8 x i32> @test_cmp_b_256(<32 x i8> %a0, <32 x i8> %a1) { ; CHECK-NEXT: vpinsrd $1, %r9d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xc9,0x01] ; CHECK-NEXT: vpinsrd $2, %r10d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xca,0x02] ; CHECK-NEXT: vpinsrd $3, %esi, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x22,0xce,0x03] -; CHECK-NEXT: vinserti32x4 $1, %xmm0, %ymm1, %ymm0 ## encoding: [0x62,0xf3,0x75,0x28,0x38,0xc0,0x01] +; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xc0,0x01] ; CHECK-NEXT: retq ## encoding: [0xc3] %res0 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 -1) %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0 @@ -79,7 +79,7 @@ define <8 x i32> @test_mask_cmp_b_256(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) { ; CHECK-NEXT: vpinsrd $1, %r9d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xc9,0x01] ; CHECK-NEXT: vpinsrd $2, %r10d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xca,0x02] ; CHECK-NEXT: vpinsrd $3, %esi, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x22,0xce,0x03] -; CHECK-NEXT: vinserti32x4 $1, %xmm0, %ymm1, %ymm0 ## encoding: [0x62,0xf3,0x75,0x28,0x38,0xc0,0x01] +; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xc0,0x01] ; CHECK-NEXT: retq ## encoding: [0xc3] %res0 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 %mask) %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0 @@ -129,7 +129,7 @@ define <8 x i32> @test_ucmp_b_256(<32 x i8> %a0, <32 x i8> %a1) { ; CHECK-NEXT: vpinsrd $1, %r9d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xc9,0x01] ; CHECK-NEXT: vpinsrd $2, %r10d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xca,0x02] ; CHECK-NEXT: vpinsrd $3, %esi, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x22,0xce,0x03] -; CHECK-NEXT: vinserti32x4 $1, %xmm0, %ymm1, %ymm0 ## encoding: [0x62,0xf3,0x75,0x28,0x38,0xc0,0x01] +; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xc0,0x01] ; CHECK-NEXT: retq ## encoding: [0xc3] %res0 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 -1) %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0 @@ -178,7 +178,7 @@ define <8 x i32> @test_mask_ucmp_b_256(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) ; CHECK-NEXT: vpinsrd $1, %r9d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xc9,0x01] ; CHECK-NEXT: vpinsrd $2, %r10d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xca,0x02] ; CHECK-NEXT: vpinsrd $3, %esi, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x22,0xce,0x03] -; CHECK-NEXT: vinserti32x4 $1, %xmm0, %ymm1, %ymm0 ## encoding: 
[0x62,0xf3,0x75,0x28,0x38,0xc0,0x01] +; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xc0,0x01] ; CHECK-NEXT: retq ## encoding: [0xc3] %res0 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 %mask) %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0 diff --git a/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll new file mode 100644 index 000000000000..f4cf22c5ed3a --- /dev/null +++ b/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll @@ -0,0 +1,136 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512dq | FileCheck %s + +declare <2 x double> @llvm.x86.avx512.mask.vextractf64x2.512(<8 x double>, i32, <2 x double>, i8) + +define <2 x double>@test_int_x86_avx512_mask_vextractf64x2_512(<8 x double> %x0, <2 x double> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_vextractf64x2_512: +; CHECK: ## BB#0: +; CHECK-NEXT: vextractf64x2 $1, %zmm0, %xmm0 +; CHECK-NEXT: kmovb %edi, %k0 +; CHECK-NEXT: kshiftlb $7, %k0, %k1 +; CHECK-NEXT: kshiftrb $7, %k1, %k1 +; CHECK-NEXT: kshiftlb $6, %k0, %k0 +; CHECK-NEXT: kshiftrb $7, %k0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vmovq %rax, %xmm2 +; CHECK-NEXT: kmovw %k1, %eax +; CHECK-NEXT: vmovq %rax, %xmm3 +; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; CHECK-NEXT: vpsllq $63, %xmm2, %xmm2 +; CHECK-NEXT: vpsrad $31, %xmm2, %xmm2 +; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm1 +; CHECK-NEXT: vandpd %xmm0, %xmm2, %xmm2 +; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vaddpd %xmm0, %xmm2, %xmm0 +; CHECK-NEXT: retq + %res = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.512(<8 x double> %x0,i32 1, <2 x double> %x2, i8 %x3) + %res2 = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.512(<8 x double> %x0,i32 1, <2 x double> zeroinitializer, i8 %x3) + %res1 = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.512(<8 x double> %x0,i32 1, <2 x double> zeroinitializer, i8 -1) + %res3 = fadd <2 x double> %res, %res1 + %res4 = fadd <2 x double> %res2, %res3 + ret <2 x double> %res4 +} + +declare <8 x float> @llvm.x86.avx512.mask.vextractf32x8.512(<16 x float>, i32, <8 x float>, i8) + +define <8 x float>@test_int_x86_avx512_mask_vextractf32x8(<16 x float> %x0, <8 x float> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_vextractf32x8: +; CHECK: ## BB#0: +; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm2 +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 +; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: retq + %res = call <8 x float> @llvm.x86.avx512.mask.vextractf32x8.512(<16 x float> %x0,i32 1, <8 x float> %x2, i8 %x3) + %res2 = call <8 x float> @llvm.x86.avx512.mask.vextractf32x8.512(<16 x float> %x0,i32 1, <8 x float> zeroinitializer, i8 %x3) + %res1 = call <8 x float> @llvm.x86.avx512.mask.vextractf32x8.512(<16 x float> %x0,i32 1, <8 x float> zeroinitializer, i8 -1) + %res3 = fadd <8 x float> %res, %res1 + %res4 = fadd <8 x float> %res2, %res3 + ret <8 x float> %res4 +} + +declare <16 x float> @llvm.x86.avx512.mask.insertf32x8.512(<16 x float>, <8 x float>, i32, <16 x float>, i16) + +define <16 x float>@test_int_x86_avx512_mask_insertf32x8_512(<16 x float> %x0, <8 x float> %x1, <16 x 
float> %x3, i16 %x4) { +; CHECK-LABEL: test_int_x86_avx512_mask_insertf32x8_512: +; CHECK: ## BB#0: +; CHECK-NEXT: vinsertf32x8 $1, %ymm1, %zmm0, %zmm3 +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vinsertf32x8 $1, %ymm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vinsertf32x8 $1, %ymm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: vaddps %zmm0, %zmm2, %zmm0 +; CHECK-NEXT: vaddps %zmm0, %zmm3, %zmm0 +; CHECK-NEXT: retq + %res = call <16 x float> @llvm.x86.avx512.mask.insertf32x8.512(<16 x float> %x0, <8 x float> %x1, i32 1, <16 x float> %x3, i16 %x4) + %res1 = call <16 x float> @llvm.x86.avx512.mask.insertf32x8.512(<16 x float> %x0, <8 x float> %x1, i32 1, <16 x float> zeroinitializer, i16 %x4) + %res2 = call <16 x float> @llvm.x86.avx512.mask.insertf32x8.512(<16 x float> %x0, <8 x float> %x1, i32 1, <16 x float> %x3, i16 -1) + %res3 = fadd <16 x float> %res, %res1 + %res4 = fadd <16 x float> %res2, %res3 + ret <16 x float> %res4 +} + +declare <8 x double> @llvm.x86.avx512.mask.insertf64x2.512(<8 x double>, <2 x double>, i32, <8 x double>, i8) + +define <8 x double>@test_int_x86_avx512_mask_insertf64x2_512(<8 x double> %x0, <2 x double> %x1,<8 x double> %x3, i8 %x4) { +; CHECK-LABEL: test_int_x86_avx512_mask_insertf64x2_512: +; CHECK: ## BB#0: +; CHECK-NEXT: ## kill: %XMM1<def> %XMM1<kill> %ZMM1<def> +; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %zmm0, %zmm3 +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: vaddpd %zmm0, %zmm2, %zmm0 +; CHECK-NEXT: vaddpd %zmm3, %zmm0, %zmm0 +; CHECK-NEXT: retq + %res = call <8 x double> @llvm.x86.avx512.mask.insertf64x2.512(<8 x double> %x0, <2 x double> %x1, i32 1, <8 x double> %x3, i8 %x4) + %res1 = call <8 x double> @llvm.x86.avx512.mask.insertf64x2.512(<8 x double> %x0, <2 x double> %x1, i32 1, <8 x double> zeroinitializer, i8 %x4) + %res2 = call <8 x double> @llvm.x86.avx512.mask.insertf64x2.512(<8 x double> %x0, <2 x double> %x1, i32 1, <8 x double> %x3, i8 -1) + %res3 = fadd <8 x double> %res, %res1 + %res4 = fadd <8 x double> %res3, %res2 + ret <8 x double> %res4 +} + +declare <16 x i32> @llvm.x86.avx512.mask.inserti32x8.512(<16 x i32>, <8 x i32>, i32, <16 x i32>, i16) + +define <16 x i32>@test_int_x86_avx512_mask_inserti32x8_512(<16 x i32> %x0, <8 x i32> %x1, <16 x i32> %x3, i16 %x4) { +; CHECK-LABEL: test_int_x86_avx512_mask_inserti32x8_512: +; CHECK: ## BB#0: +; CHECK-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm3 +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0 +; CHECK-NEXT: vpaddd %zmm3, %zmm0, %zmm0 +; CHECK-NEXT: retq + %res = call <16 x i32> @llvm.x86.avx512.mask.inserti32x8.512(<16 x i32> %x0, <8 x i32> %x1, i32 1, <16 x i32> %x3, i16 %x4) + %res1 = call <16 x i32> @llvm.x86.avx512.mask.inserti32x8.512(<16 x i32> %x0, <8 x i32> %x1, i32 1, <16 x i32> zeroinitializer, i16 %x4) + %res2 = call <16 x i32> @llvm.x86.avx512.mask.inserti32x8.512(<16 x i32> %x0, <8 x i32> %x1, i32 1, <16 x i32> %x3, i16 -1) + %res3 = add <16 x i32> %res, %res1 + %res4 = add <16 x i32> %res3, %res2 + ret <16 x i32> %res4 +} + +declare <8 x i64> @llvm.x86.avx512.mask.inserti64x2.512(<8 x i64>, <2 x i64>, i32, <8 x i64>, i8) + +define <8 x i64>@test_int_x86_avx512_mask_inserti64x2_512(<8 x i64> %x0, <2 x i64> %x1, <8 x i64> %x3, i8 %x4) { +; CHECK-LABEL: test_int_x86_avx512_mask_inserti64x2_512: +; CHECK: ## BB#0: +; CHECK-NEXT: ## 
kill: %XMM1<def> %XMM1<kill> %ZMM1<def> +; CHECK-NEXT: vinserti64x2 $1, %xmm1, %zmm0, %zmm3 +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vinserti64x2 $1, %xmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vinserti64x2 $1, %xmm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0 +; CHECK-NEXT: vpaddq %zmm0, %zmm3, %zmm0 +; CHECK-NEXT: retq + %res = call <8 x i64> @llvm.x86.avx512.mask.inserti64x2.512(<8 x i64> %x0, <2 x i64> %x1, i32 1, <8 x i64> %x3, i8 %x4) + %res1 = call <8 x i64> @llvm.x86.avx512.mask.inserti64x2.512(<8 x i64> %x0, <2 x i64> %x1, i32 1, <8 x i64> zeroinitializer, i8 %x4) + %res2 = call <8 x i64> @llvm.x86.avx512.mask.inserti64x2.512(<8 x i64> %x0, <2 x i64> %x1, i32 1, <8 x i64> %x3, i8 -1) + %res3 = add <8 x i64> %res, %res1 + %res4 = add <8 x i64> %res2, %res3 + ret <8 x i64> %res4 +} diff --git a/test/CodeGen/X86/avx512dq-intrinsics.ll b/test/CodeGen/X86/avx512dq-intrinsics.ll index 5826bb6fad23..375d63264517 100644 --- a/test/CodeGen/X86/avx512dq-intrinsics.ll +++ b/test/CodeGen/X86/avx512dq-intrinsics.ll @@ -325,127 +325,6 @@ define <2 x double>@test_int_x86_avx512_mask_range_sd(<2 x double> %x0, <2 x dou ret <2 x double> %res2 } - -declare <2 x double> @llvm.x86.avx512.mask.vextractf64x2.512(<8 x double>, i32, <2 x double>, i8) - -define <2 x double>@test_int_x86_avx512_mask_vextractf64x2_512(<8 x double> %x0, <2 x double> %x2, i8 %x3) { -; CHECK-LABEL: test_int_x86_avx512_mask_vextractf64x2_512: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovb %edi, %k1 -; CHECK-NEXT: vextractf64x2 $1, %zmm0, %xmm1 {%k1} -; CHECK-NEXT: vextractf64x2 $1, %zmm0, %xmm2 {%k1} {z} -; CHECK-NEXT: vextractf64x2 $1, %zmm0, %xmm0 -; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 -; CHECK-NEXT: vaddpd %xmm0, %xmm2, %xmm0 -; CHECK-NEXT: retq - %res = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.512(<8 x double> %x0,i32 1, <2 x double> %x2, i8 %x3) - %res2 = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.512(<8 x double> %x0,i32 1, <2 x double> zeroinitializer, i8 %x3) - %res1 = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.512(<8 x double> %x0,i32 1, <2 x double> zeroinitializer, i8 -1) - %res3 = fadd <2 x double> %res, %res1 - %res4 = fadd <2 x double> %res2, %res3 - ret <2 x double> %res4 -} - -declare <8 x float> @llvm.x86.avx512.mask.vextractf32x8.512(<16 x float>, i32, <8 x float>, i8) - -define <8 x float>@test_int_x86_avx512_mask_vextractf32x8(<16 x float> %x0, <8 x float> %x2, i8 %x3) { -; CHECK-LABEL: test_int_x86_avx512_mask_vextractf32x8: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovb %edi, %k1 -; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm1 {%k1} -; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm2 {%k1} {z} -; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm0 -; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; CHECK-NEXT: vaddps %ymm0, %ymm2, %ymm0 -; CHECK-NEXT: retq - %res = call <8 x float> @llvm.x86.avx512.mask.vextractf32x8.512(<16 x float> %x0,i32 1, <8 x float> %x2, i8 %x3) - %res2 = call <8 x float> @llvm.x86.avx512.mask.vextractf32x8.512(<16 x float> %x0,i32 1, <8 x float> zeroinitializer, i8 %x3) - %res1 = call <8 x float> @llvm.x86.avx512.mask.vextractf32x8.512(<16 x float> %x0,i32 1, <8 x float> zeroinitializer, i8 -1) - %res3 = fadd <8 x float> %res, %res1 - %res4 = fadd <8 x float> %res2, %res3 - ret <8 x float> %res4 -} - -declare <16 x float> @llvm.x86.avx512.mask.insertf32x8.512(<16 x float>, <8 x float>, i32, <16 x float>, i16) - -define <16 x float>@test_int_x86_avx512_mask_insertf32x8_512(<16 x float> %x0, <8 x float> %x1, <16 x float> %x3, i16 %x4) { -; CHECK-LABEL: 
test_int_x86_avx512_mask_insertf32x8_512: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vinsertf32x8 $1, %ymm1, %zmm0, %zmm2 {%k1} -; CHECK-NEXT: vinsertf32x8 $1, %ymm1, %zmm0, %zmm3 {%k1} {z} -; CHECK-NEXT: vinsertf32x8 $1, %ymm1, %zmm0, %zmm0 -; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm1 -; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0 -; CHECK-NEXT: retq - %res = call <16 x float> @llvm.x86.avx512.mask.insertf32x8.512(<16 x float> %x0, <8 x float> %x1, i32 1, <16 x float> %x3, i16 %x4) - %res1 = call <16 x float> @llvm.x86.avx512.mask.insertf32x8.512(<16 x float> %x0, <8 x float> %x1, i32 1, <16 x float> zeroinitializer, i16 %x4) - %res2 = call <16 x float> @llvm.x86.avx512.mask.insertf32x8.512(<16 x float> %x0, <8 x float> %x1, i32 1, <16 x float> %x3, i16 -1) - %res3 = fadd <16 x float> %res, %res1 - %res4 = fadd <16 x float> %res2, %res3 - ret <16 x float> %res4 -} - -declare <8 x double> @llvm.x86.avx512.mask.insertf64x2.512(<8 x double>, <2 x double>, i32, <8 x double>, i8) - -define <8 x double>@test_int_x86_avx512_mask_insertf64x2_512(<8 x double> %x0, <2 x double> %x1,<8 x double> %x3, i8 %x4) { -; CHECK-LABEL: test_int_x86_avx512_mask_insertf64x2_512: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovb %edi, %k1 -; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %zmm0, %zmm2 {%k1} -; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %zmm0, %zmm3 {%k1} {z} -; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %zmm0, %zmm0 -; CHECK-NEXT: vaddpd %zmm3, %zmm2, %zmm1 -; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0 -; CHECK-NEXT: retq - %res = call <8 x double> @llvm.x86.avx512.mask.insertf64x2.512(<8 x double> %x0, <2 x double> %x1, i32 1, <8 x double> %x3, i8 %x4) - %res1 = call <8 x double> @llvm.x86.avx512.mask.insertf64x2.512(<8 x double> %x0, <2 x double> %x1, i32 1, <8 x double> zeroinitializer, i8 %x4) - %res2 = call <8 x double> @llvm.x86.avx512.mask.insertf64x2.512(<8 x double> %x0, <2 x double> %x1, i32 1, <8 x double> %x3, i8 -1) - %res3 = fadd <8 x double> %res, %res1 - %res4 = fadd <8 x double> %res3, %res2 - ret <8 x double> %res4 -} - -declare <16 x i32> @llvm.x86.avx512.mask.inserti32x8.512(<16 x i32>, <8 x i32>, i32, <16 x i32>, i16) - -define <16 x i32>@test_int_x86_avx512_mask_inserti32x8_512(<16 x i32> %x0, <8 x i32> %x1, <16 x i32> %x3, i16 %x4) { -; CHECK-LABEL: test_int_x86_avx512_mask_inserti32x8_512: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm2 {%k1} -; CHECK-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm3 {%k1} {z} -; CHECK-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 -; CHECK-NEXT: vpaddd %zmm3, %zmm2, %zmm1 -; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0 -; CHECK-NEXT: retq - %res = call <16 x i32> @llvm.x86.avx512.mask.inserti32x8.512(<16 x i32> %x0, <8 x i32> %x1, i32 1, <16 x i32> %x3, i16 %x4) - %res1 = call <16 x i32> @llvm.x86.avx512.mask.inserti32x8.512(<16 x i32> %x0, <8 x i32> %x1, i32 1, <16 x i32> zeroinitializer, i16 %x4) - %res2 = call <16 x i32> @llvm.x86.avx512.mask.inserti32x8.512(<16 x i32> %x0, <8 x i32> %x1, i32 1, <16 x i32> %x3, i16 -1) - %res3 = add <16 x i32> %res, %res1 - %res4 = add <16 x i32> %res3, %res2 - ret <16 x i32> %res4 -} - -declare <8 x i64> @llvm.x86.avx512.mask.inserti64x2.512(<8 x i64>, <2 x i64>, i32, <8 x i64>, i8) - -define <8 x i64>@test_int_x86_avx512_mask_inserti64x2_512(<8 x i64> %x0, <2 x i64> %x1, <8 x i64> %x3, i8 %x4) { -; CHECK-LABEL: test_int_x86_avx512_mask_inserti64x2_512: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovb %edi, %k1 -; CHECK-NEXT: vinserti64x2 $1, %xmm1, %zmm0, %zmm2 {%k1} -; CHECK-NEXT: vinserti64x2 
$1, %xmm1, %zmm0, %zmm3 {%k1} {z} -; CHECK-NEXT: vinserti64x2 $1, %xmm1, %zmm0, %zmm0 -; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm1 -; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0 -; CHECK-NEXT: retq - %res = call <8 x i64> @llvm.x86.avx512.mask.inserti64x2.512(<8 x i64> %x0, <2 x i64> %x1, i32 1, <8 x i64> %x3, i8 %x4) - %res1 = call <8 x i64> @llvm.x86.avx512.mask.inserti64x2.512(<8 x i64> %x0, <2 x i64> %x1, i32 1, <8 x i64> zeroinitializer, i8 %x4) - %res2 = call <8 x i64> @llvm.x86.avx512.mask.inserti64x2.512(<8 x i64> %x0, <2 x i64> %x1, i32 1, <8 x i64> %x3, i8 -1) - %res3 = add <8 x i64> %res, %res1 - %res4 = add <8 x i64> %res2, %res3 - ret <8 x i64> %res4 -} - declare i8 @llvm.x86.avx512.mask.fpclass.pd.512(<8 x double>, i32, i8) define i8 @test_int_x86_avx512_mask_fpclass_pd_512(<8 x double> %x0, i8 %x1) { diff --git a/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll index 9bf989df22a3..f8460bf880f9 100644 --- a/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll +++ b/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll @@ -1560,3 +1560,62 @@ define <2 x i64> @test_mask_mullo_epi64_rmbkz_128(<2 x i64> %a, i64* %ptr_b, i8 declare <2 x i64> @llvm.x86.avx512.mask.pmull.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8) +declare <2 x double> @llvm.x86.avx512.mask.vextractf64x2.256(<4 x double>, i32, <2 x double>, i8) + +define <2 x double>@test_int_x86_avx512_mask_vextractf64x2_256(<4 x double> %x0, <2 x double> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_vextractf64x2_256: +; CHECK: ## BB#0: +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x19,0xc2,0x01] +; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf] +; CHECK-NEXT: vextractf64x2 $1, %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x19,0xc1,0x01] +; CHECK-NEXT: vextractf64x2 $1, %ymm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x19,0xc0,0x01] +; CHECK-NEXT: vaddpd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xca] +; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc0] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.256(<4 x double> %x0,i32 1, <2 x double> %x2, i8 %x3) + %res2 = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.256(<4 x double> %x0,i32 1, <2 x double> zeroinitializer, i8 %x3) + %res1 = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.256(<4 x double> %x0,i32 1, <2 x double> zeroinitializer, i8 -1) + %res3 = fadd <2 x double> %res, %res1 + %res4 = fadd <2 x double> %res3, %res2 + ret <2 x double> %res4 +} + +declare <4 x double> @llvm.x86.avx512.mask.insertf64x2.256(<4 x double>, <2 x double>, i32, <4 x double>, i8) + +define <4 x double>@test_int_x86_avx512_mask_insertf64x2_256(<4 x double> %x0, <2 x double> %x1, <4 x double> %x3, i8 %x4) { +; CHECK-LABEL: test_int_x86_avx512_mask_insertf64x2_256: +; CHECK: ## BB#0: +; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xd9,0x01] +; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf] +; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x18,0xd1,0x01] +; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x18,0xc1,0x01] +; CHECK-NEXT: vaddpd %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xcb] +; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ## 
EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <4 x double> @llvm.x86.avx512.mask.insertf64x2.256(<4 x double> %x0, <2 x double> %x1, i32 1, <4 x double> %x3, i8 %x4) + %res1 = call <4 x double> @llvm.x86.avx512.mask.insertf64x2.256(<4 x double> %x0, <2 x double> %x1, i32 1, <4 x double> %x3, i8 -1) + %res2 = call <4 x double> @llvm.x86.avx512.mask.insertf64x2.256(<4 x double> %x0, <2 x double> %x1, i32 1, <4 x double> zeroinitializer, i8 %x4) + %res3 = fadd <4 x double> %res, %res1 + %res4 = fadd <4 x double> %res2, %res3 + ret <4 x double> %res4 +} + +declare <4 x i64> @llvm.x86.avx512.mask.inserti64x2.256(<4 x i64>, <2 x i64>, i32, <4 x i64>, i8) + +define <4 x i64>@test_int_x86_avx512_mask_inserti64x2_256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x3, i8 %x4) { +; CHECK-LABEL: test_int_x86_avx512_mask_inserti64x2_256: +; CHECK: ## BB#0: +; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xd9,0x01] +; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf] +; CHECK-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x38,0xd1,0x01] +; CHECK-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x38,0xc1,0x01] +; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xcb] +; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.inserti64x2.256(<4 x i64> %x0, <2 x i64> %x1, i32 1, <4 x i64> %x3, i8 %x4) + %res1 = call <4 x i64> @llvm.x86.avx512.mask.inserti64x2.256(<4 x i64> %x0, <2 x i64> %x1, i32 1, <4 x i64> %x3, i8 -1) + %res2 = call <4 x i64> @llvm.x86.avx512.mask.inserti64x2.256(<4 x i64> %x0, <2 x i64> %x1, i32 1, <4 x i64> zeroinitializer, i8 %x4) + %res3 = add <4 x i64> %res, %res1 + %res4 = add <4 x i64> %res3, %res2 + ret <4 x i64> %res4 +} diff --git a/test/CodeGen/X86/avx512dqvl-intrinsics.ll b/test/CodeGen/X86/avx512dqvl-intrinsics.ll index eb9c6b64bcf6..3430c5715376 100644 --- a/test/CodeGen/X86/avx512dqvl-intrinsics.ll +++ b/test/CodeGen/X86/avx512dqvl-intrinsics.ll @@ -549,66 +549,6 @@ define <8 x float>@test_int_x86_avx512_mask_range_ps_256(<8 x float> %x0, <8 x f ret <8 x float> %res2 } -declare <2 x double> @llvm.x86.avx512.mask.vextractf64x2.256(<4 x double>, i32, <2 x double>, i8) - -define <2 x double>@test_int_x86_avx512_mask_vextractf64x2_256(<4 x double> %x0, <2 x double> %x2, i8 %x3) { -; CHECK-LABEL: test_int_x86_avx512_mask_vextractf64x2_256: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf] -; CHECK-NEXT: vextractf64x2 $1, %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x19,0xc1,0x01] -; CHECK-NEXT: vextractf64x2 $1, %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x19,0xc2,0x01] -; CHECK-NEXT: vextractf64x2 $1, %ymm0, %xmm0 ## encoding: [0x62,0xf3,0xfd,0x28,0x19,0xc0,0x01] -; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc0] -; CHECK-NEXT: vaddpd %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.256(<4 x double> %x0,i32 1, <2 x double> %x2, i8 %x3) - %res2 = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.256(<4 x double> %x0,i32 1, <2 x double> zeroinitializer, i8 %x3) - 
%res1 = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.256(<4 x double> %x0,i32 1, <2 x double> zeroinitializer, i8 -1) - %res3 = fadd <2 x double> %res, %res1 - %res4 = fadd <2 x double> %res3, %res2 - ret <2 x double> %res4 -} - -declare <4 x double> @llvm.x86.avx512.mask.insertf64x2.256(<4 x double>, <2 x double>, i32, <4 x double>, i8) - -define <4 x double>@test_int_x86_avx512_mask_insertf64x2_256(<4 x double> %x0, <2 x double> %x1, <4 x double> %x3, i8 %x4) { -; CHECK-LABEL: test_int_x86_avx512_mask_insertf64x2_256: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf] -; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x18,0xd1,0x01] -; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x18,0xd9,0x01] -; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0xfd,0x28,0x18,0xc1,0x01] -; CHECK-NEXT: vaddpd %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc0] -; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <4 x double> @llvm.x86.avx512.mask.insertf64x2.256(<4 x double> %x0, <2 x double> %x1, i32 1, <4 x double> %x3, i8 %x4) - %res1 = call <4 x double> @llvm.x86.avx512.mask.insertf64x2.256(<4 x double> %x0, <2 x double> %x1, i32 1, <4 x double> %x3, i8 -1) - %res2 = call <4 x double> @llvm.x86.avx512.mask.insertf64x2.256(<4 x double> %x0, <2 x double> %x1, i32 1, <4 x double> zeroinitializer, i8 %x4) - %res3 = fadd <4 x double> %res, %res1 - %res4 = fadd <4 x double> %res2, %res3 - ret <4 x double> %res4 -} - -declare <4 x i64> @llvm.x86.avx512.mask.inserti64x2.256(<4 x i64>, <2 x i64>, i32, <4 x i64>, i8) - -define <4 x i64>@test_int_x86_avx512_mask_inserti64x2_256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x3, i8 %x4) { -; CHECK-LABEL: test_int_x86_avx512_mask_inserti64x2_256: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf] -; CHECK-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x38,0xd1,0x01] -; CHECK-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x38,0xd9,0x01] -; CHECK-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0xfd,0x28,0x38,0xc1,0x01] -; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0] -; CHECK-NEXT: vpaddq %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc3] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <4 x i64> @llvm.x86.avx512.mask.inserti64x2.256(<4 x i64> %x0, <2 x i64> %x1, i32 1, <4 x i64> %x3, i8 %x4) - %res1 = call <4 x i64> @llvm.x86.avx512.mask.inserti64x2.256(<4 x i64> %x0, <2 x i64> %x1, i32 1, <4 x i64> %x3, i8 -1) - %res2 = call <4 x i64> @llvm.x86.avx512.mask.inserti64x2.256(<4 x i64> %x0, <2 x i64> %x1, i32 1, <4 x i64> zeroinitializer, i8 %x4) - %res3 = add <4 x i64> %res, %res1 - %res4 = add <4 x i64> %res3, %res2 - ret <4 x i64> %res4 -} - declare i8 @llvm.x86.avx512.mask.fpclass.ps.128(<4 x float>, i32, i8) define i8 @test_int_x86_avx512_mask_fpclass_ps_128(<4 x float> %x0, i8 %x1) { diff --git a/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll index 8d44af7b7a4c..c63d47d780d1 100644 --- a/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll +++ b/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll @@ -4773,3 +4773,63 @@ define <4 x 
float>@test_int_x86_avx512_mask_vpermilvar_ps_128(<4 x float> %x0, < ret <4 x float> %res4 } +declare <4 x float> @llvm.x86.avx512.mask.vextractf32x4.256(<8 x float>, i32, <4 x float>, i8) + +define <4 x float>@test_int_x86_avx512_mask_vextractf32x4_256(<8 x float> %x0, <4 x float> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_vextractf32x4_256: +; CHECK: ## BB#0: +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x19,0xc2,0x01] +; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; CHECK-NEXT: vextractf32x4 $1, %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x19,0xc1,0x01] +; CHECK-NEXT: vextractf32x4 $1, %ymm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x19,0xc0,0x01] +; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc0] +; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc0] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <4 x float> @llvm.x86.avx512.mask.vextractf32x4.256(<8 x float> %x0, i32 1, <4 x float> %x2, i8 %x3) + %res1 = call <4 x float> @llvm.x86.avx512.mask.vextractf32x4.256(<8 x float> %x0, i32 1, <4 x float> zeroinitializer, i8 %x3) + %res2 = call <4 x float> @llvm.x86.avx512.mask.vextractf32x4.256(<8 x float> %x0, i32 1, <4 x float> zeroinitializer, i8 -1) + %res3 = fadd <4 x float> %res, %res1 + %res4 = fadd <4 x float> %res2, %res3 + ret <4 x float> %res4 +} + +declare <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float>, <4 x float>, i32, <8 x float>, i8) + +define <8 x float>@test_int_x86_avx512_mask_insertf32x4_256(<8 x float> %x0, <4 x float> %x1, <8 x float> %x3, i8 %x4) { +; CHECK-LABEL: test_int_x86_avx512_mask_insertf32x4_256: +; CHECK: ## BB#0: +; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xd9,0x01] +; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x18,0xd1,0x01] +; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x18,0xc1,0x01] +; CHECK-NEXT: vaddps %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xcb] +; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float> %x0, <4 x float> %x1, i32 1, <8 x float> %x3, i8 %x4) + %res1 = call <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float> %x0, <4 x float> %x1, i32 1, <8 x float> %x3, i8 -1) + %res2 = call <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float> %x0, <4 x float> %x1, i32 1, <8 x float> zeroinitializer, i8 %x4) + %res3 = fadd <8 x float> %res, %res1 + %res4 = fadd <8 x float> %res2, %res3 + ret <8 x float> %res4 +} + +declare <8 x i32> @llvm.x86.avx512.mask.inserti32x4.256(<8 x i32>, <4 x i32>, i32, <8 x i32>, i8) + +define <8 x i32>@test_int_x86_avx512_mask_inserti32x4_256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x3, i8 %x4) { +; CHECK-LABEL: test_int_x86_avx512_mask_inserti32x4_256: +; CHECK: ## BB#0: +; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xd9,0x01] +; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; CHECK-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x38,0xd1,0x01] +; CHECK-NEXT: 
vinserti32x4 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x38,0xc1,0x01] +; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xcb] +; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + + %res = call <8 x i32> @llvm.x86.avx512.mask.inserti32x4.256(<8 x i32> %x0, <4 x i32> %x1, i32 1, <8 x i32> %x3, i8 %x4) + %res1 = call <8 x i32> @llvm.x86.avx512.mask.inserti32x4.256(<8 x i32> %x0, <4 x i32> %x1, i32 1, <8 x i32> %x3, i8 -1) + %res2 = call <8 x i32> @llvm.x86.avx512.mask.inserti32x4.256(<8 x i32> %x0, <4 x i32> %x1, i32 1, <8 x i32> zeroinitializer, i8 %x4) + %res3 = add <8 x i32> %res, %res1 + %res4 = add <8 x i32> %res2, %res3 + ret <8 x i32> %res4 +} diff --git a/test/CodeGen/X86/avx512vl-intrinsics.ll b/test/CodeGen/X86/avx512vl-intrinsics.ll index 94095f549e51..82014283246e 100644 --- a/test/CodeGen/X86/avx512vl-intrinsics.ll +++ b/test/CodeGen/X86/avx512vl-intrinsics.ll @@ -3621,26 +3621,6 @@ define <4 x i64>@test_int_x86_avx512_mask_shuf_i64x2_256(<4 x i64> %x0, <4 x i64 ret <4 x i64> %res2 } -declare <4 x float> @llvm.x86.avx512.mask.vextractf32x4.256(<8 x float>, i32, <4 x float>, i8) - -define <4 x float>@test_int_x86_avx512_mask_vextractf32x4_256(<8 x float> %x0, <4 x float> %x2, i8 %x3) { -; CHECK-LABEL: test_int_x86_avx512_mask_vextractf32x4_256: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vextractf32x4 $1, %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x19,0xc1,0x01] -; CHECK-NEXT: vextractf32x4 $1, %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x19,0xc2,0x01] -; CHECK-NEXT: vextractf32x4 $1, %ymm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x28,0x19,0xc0,0x01] -; CHECK-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca] -; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <4 x float> @llvm.x86.avx512.mask.vextractf32x4.256(<8 x float> %x0, i32 1, <4 x float> %x2, i8 %x3) - %res1 = call <4 x float> @llvm.x86.avx512.mask.vextractf32x4.256(<8 x float> %x0, i32 1, <4 x float> zeroinitializer, i8 %x3) - %res2 = call <4 x float> @llvm.x86.avx512.mask.vextractf32x4.256(<8 x float> %x0, i32 1, <4 x float> zeroinitializer, i8 -1) - %res3 = fadd <4 x float> %res, %res1 - %res4 = fadd <4 x float> %res2, %res3 - ret <4 x float> %res4 -} - declare <2 x double> @llvm.x86.avx512.mask.getmant.pd.128(<2 x double>, i32, <2 x double>, i8) define <2 x double>@test_int_x86_avx512_mask_getmant_pd_128(<2 x double> %x0, <2 x double> %x2, i8 %x3) { @@ -3709,47 +3689,6 @@ define <8 x float>@test_int_x86_avx512_mask_getmant_ps_256(<8 x float> %x0, <8 x ret <8 x float> %res2 } -declare <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float>, <4 x float>, i32, <8 x float>, i8) - -define <8 x float>@test_int_x86_avx512_mask_insertf32x4_256(<8 x float> %x0, <4 x float> %x1, <8 x float> %x3, i8 %x4) { -; CHECK-LABEL: test_int_x86_avx512_mask_insertf32x4_256: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x18,0xd1,0x01] -; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x18,0xd9,0x01] -; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0 ## encoding: 
[0x62,0xf3,0x7d,0x28,0x18,0xc1,0x01] -; CHECK-NEXT: vaddps %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc0] -; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe4,0x58,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float> %x0, <4 x float> %x1, i32 1, <8 x float> %x3, i8 %x4) - %res1 = call <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float> %x0, <4 x float> %x1, i32 1, <8 x float> %x3, i8 -1) - %res2 = call <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float> %x0, <4 x float> %x1, i32 1, <8 x float> zeroinitializer, i8 %x4) - %res3 = fadd <8 x float> %res, %res1 - %res4 = fadd <8 x float> %res2, %res3 - ret <8 x float> %res4 -} - -declare <8 x i32> @llvm.x86.avx512.mask.inserti32x4.256(<8 x i32>, <4 x i32>, i32, <8 x i32>, i8) - -define <8 x i32>@test_int_x86_avx512_mask_inserti32x4_256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x3, i8 %x4) { -; CHECK-LABEL: test_int_x86_avx512_mask_inserti32x4_256: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x38,0xd1,0x01] -; CHECK-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x38,0xd9,0x01] -; CHECK-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0x7d,0x28,0x38,0xc1,0x01] -; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc0] -; CHECK-NEXT: vpaddd %ymm0, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfe,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] - - %res = call <8 x i32> @llvm.x86.avx512.mask.inserti32x4.256(<8 x i32> %x0, <4 x i32> %x1, i32 1, <8 x i32> %x3, i8 %x4) - %res1 = call <8 x i32> @llvm.x86.avx512.mask.inserti32x4.256(<8 x i32> %x0, <4 x i32> %x1, i32 1, <8 x i32> %x3, i8 -1) - %res2 = call <8 x i32> @llvm.x86.avx512.mask.inserti32x4.256(<8 x i32> %x0, <4 x i32> %x1, i32 1, <8 x i32> zeroinitializer, i8 %x4) - %res3 = add <8 x i32> %res, %res1 - %res4 = add <8 x i32> %res2, %res3 - ret <8 x i32> %res4 -} - declare <4 x i32> @llvm.x86.avx512.mask.pternlog.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i32, i8) define <4 x i32>@test_int_x86_avx512_mask_pternlog_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x4) { diff --git a/test/CodeGen/X86/frame-lowering-debug-intrinsic-2.ll b/test/CodeGen/X86/frame-lowering-debug-intrinsic-2.ll new file mode 100644 index 000000000000..ab797e04b400 --- /dev/null +++ b/test/CodeGen/X86/frame-lowering-debug-intrinsic-2.ll @@ -0,0 +1,72 @@ +; Test ensuring debug intrinsics do not affect generated function prologue. +; +; RUN: llc -O1 -mtriple=x86_64-unknown-unknown -o - %s | FileCheck %s + +@a = local_unnamed_addr global i64 0, align 8 + +define void @noDebug() { +entry: + %0 = load i64, i64* @a, align 8 + %1 = load i64, i64* @a, align 8 + %2 = load i64, i64* @a, align 8 + %3 = tail call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %0, i64 %1) + %4 = extractvalue { i64, i1 } %3, 0 + %5 = tail call i64 @fn1(i64 %4, i64 %2) + tail call void (...) @printf() + tail call void (...) 
@printf(i64 1, i64 2, i64 3, i64 4, i32 0, i64 0, i64 %4, i64 %5) + ret void +} + +; CHECK-LABEL: noDebug +; CHECK: addq $24, %rsp +; CHECK: popq %rbx +; CHECK-NEXT: popq %r14 +; CHECK-NEXT: retq + + +define void @withDebug() !dbg !18 { +entry: + %0 = load i64, i64* @a, align 8 + %1 = load i64, i64* @a, align 8 + %2 = load i64, i64* @a, align 8 + %3 = tail call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %0, i64 %1) + %4 = extractvalue { i64, i1 } %3, 0 + %5 = tail call i64 @fn1(i64 %4, i64 %2) + tail call void @llvm.dbg.value(metadata i64 %4, i64 0, metadata !23, metadata !33), !dbg !34 + tail call void @llvm.dbg.value(metadata i64 %5, i64 0, metadata !22, metadata !33), !dbg !35 + tail call void (...) @printf() + tail call void (...) @printf(i64 1, i64 2, i64 3, i64 4, i32 0, i64 0, i64 %4, i64 %5) + ret void +} + +; CHECK-LABEL: withDebug +; CHECK: #DEBUG_VALUE: test:j <- %RBX +; CHECK-NEXT: addq $24, %rsp +; CHECK: popq %rbx +; CHECK-NEXT: popq %r14 +; CHECK-NEXT: retq + +declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64) +declare i64 @fn1(i64, i64) + +declare void @printf(...) + +declare void @llvm.dbg.value(metadata, i64, metadata, metadata) + + +!llvm.dbg.cu = !{!1} +!llvm.module.flags = !{!15, !16} + +!1 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !2, producer: "clang version 4.0.0") +!2 = !DIFile(filename: "test.cpp", directory: "") +!11 = !DIBasicType(name: "long int", size: 64, encoding: DW_ATE_signed) +!15 = !{i32 2, !"Dwarf Version", i32 4} +!16 = !{i32 2, !"Debug Info Version", i32 3} +!18 = distinct !DISubprogram(name: "test", scope: !2, file: !2, line: 5, unit: !1) +!22 = !DILocalVariable(name: "i", scope: !18, file: !2, line: 6, type: !11) +!23 = !DILocalVariable(name: "j", scope: !18, file: !2, line: 7, type: !11) +!33 = !DIExpression() +!34 = !DILocation(line: 7, column: 17, scope: !18) +!35 = !DILocation(line: 6, column: 8, scope: !18) +!36 = !DILocation(line: 9, column: 3, scope: !18) +!37 = !DILocation(line: 10, column: 10, scope: !18) diff --git a/test/CodeGen/X86/frame-lowering-debug-intrinsic.ll b/test/CodeGen/X86/frame-lowering-debug-intrinsic.ll index 8614d1b4c6c3..e86d094ac341 100644 --- a/test/CodeGen/X86/frame-lowering-debug-intrinsic.ll +++ b/test/CodeGen/X86/frame-lowering-debug-intrinsic.ll @@ -2,31 +2,56 @@ ; ; RUN: llc -O1 -mtriple=x86_64-unknown-unknown -o - %s | FileCheck %s - -define i64 @noDebug(i64 %a) { +define i64 @fn1NoDebug(i64 %a) { %call = call i64 @fn(i64 %a, i64 0) ret i64 %call } -; CHECK-LABEL: noDebug +; CHECK-LABEL: fn1NoDebug ; CHECK: popq %rcx -; CHECK: ret - +; CHECK-NEXT: ret -define i64 @withDebug(i64 %a) !dbg !4 { +define i64 @fn1WithDebug(i64 %a) !dbg !4 { %call = call i64 @fn(i64 %a, i64 0) tail call void @llvm.dbg.value(metadata i64 %call, i64 0, metadata !5, metadata !6), !dbg !7 ret i64 %call } -; CHECK-LABEL: withDebug +; CHECK-LABEL: fn1WithDebug ; CHECK: popq %rcx -; CHECK: ret +; CHECK-NEXT: ret + +%struct.Buffer = type { i8, [63 x i8] } + +define void @fn2NoDebug(%struct.Buffer* byval align 64 %p1) { + ret void +} + +; CHECK-LABEL: fn2NoDebug +; CHECK: and +; CHECK-NOT: add +; CHECK-NOT: sub +; CHECK: mov +; CHECK-NEXT: pop +; CHECK-NEXT: ret + +define void @fn2WithDebug(%struct.Buffer* byval align 64 %p1) !dbg !4 { + call void @llvm.dbg.declare(metadata %struct.Buffer* %p1, metadata !5, metadata !6), !dbg !7 + ret void +} +; CHECK-LABEL: fn2WithDebug +; CHECK: and +; CHECK-NOT: add +; CHECK-NOT: sub +; CHECK: mov +; CHECK-NEXT: pop +; CHECK-NEXT: ret declare i64 @fn(i64, i64) declare void 
@llvm.dbg.value(metadata, i64, metadata, metadata) +declare void @llvm.dbg.declare(metadata, metadata, metadata) !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!2,!3} diff --git a/test/CodeGen/X86/i64-to-float.ll b/test/CodeGen/X86/i64-to-float.ll index 8898551a9764..da92bdb55d7c 100644 --- a/test/CodeGen/X86/i64-to-float.ll +++ b/test/CodeGen/X86/i64-to-float.ll @@ -71,34 +71,32 @@ define <2 x double> @mask_uitofp_2i64_2f64(<2 x i64> %a) nounwind { define <4 x float> @mask_sitofp_4i64_4f32(<4 x i64> %a) nounwind { ; X32-SSE-LABEL: mask_sitofp_4i64_4f32: ; X32-SSE: # BB#0: -; X32-SSE-NEXT: andps {{\.LCPI.*}}, %xmm1 -; X32-SSE-NEXT: andps {{\.LCPI.*}}, %xmm0 ; X32-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; X32-SSE-NEXT: andps {{\.LCPI.*}}, %xmm0 ; X32-SSE-NEXT: cvtdq2ps %xmm0, %xmm0 ; X32-SSE-NEXT: retl ; ; X32-AVX-LABEL: mask_sitofp_4i64_4f32: ; X32-AVX: # BB#0: -; X32-AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0 ; X32-AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X32-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; X32-AVX-NEXT: vandps {{\.LCPI.*}}, %xmm0, %xmm0 ; X32-AVX-NEXT: vcvtdq2ps %xmm0, %xmm0 ; X32-AVX-NEXT: vzeroupper ; X32-AVX-NEXT: retl ; ; X64-SSE-LABEL: mask_sitofp_4i64_4f32: ; X64-SSE: # BB#0: -; X64-SSE-NEXT: andps {{.*}}(%rip), %xmm1 -; X64-SSE-NEXT: andps {{.*}}(%rip), %xmm0 ; X64-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; X64-SSE-NEXT: andps {{.*}}(%rip), %xmm0 ; X64-SSE-NEXT: cvtdq2ps %xmm0, %xmm0 ; X64-SSE-NEXT: retq ; ; X64-AVX-LABEL: mask_sitofp_4i64_4f32: ; X64-AVX: # BB#0: -; X64-AVX-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ; X64-AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; X64-AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 ; X64-AVX-NEXT: vcvtdq2ps %xmm0, %xmm0 ; X64-AVX-NEXT: vzeroupper ; X64-AVX-NEXT: retq @@ -110,34 +108,32 @@ define <4 x float> @mask_sitofp_4i64_4f32(<4 x i64> %a) nounwind { define <4 x float> @mask_uitofp_4i64_4f32(<4 x i64> %a) nounwind { ; X32-SSE-LABEL: mask_uitofp_4i64_4f32: ; X32-SSE: # BB#0: -; X32-SSE-NEXT: andps {{\.LCPI.*}}, %xmm1 -; X32-SSE-NEXT: andps {{\.LCPI.*}}, %xmm0 ; X32-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; X32-SSE-NEXT: andps {{\.LCPI.*}}, %xmm0 ; X32-SSE-NEXT: cvtdq2ps %xmm0, %xmm0 ; X32-SSE-NEXT: retl ; ; X32-AVX-LABEL: mask_uitofp_4i64_4f32: ; X32-AVX: # BB#0: -; X32-AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0 ; X32-AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X32-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; X32-AVX-NEXT: vandps {{\.LCPI.*}}, %xmm0, %xmm0 ; X32-AVX-NEXT: vcvtdq2ps %xmm0, %xmm0 ; X32-AVX-NEXT: vzeroupper ; X32-AVX-NEXT: retl ; ; X64-SSE-LABEL: mask_uitofp_4i64_4f32: ; X64-SSE: # BB#0: -; X64-SSE-NEXT: andps {{.*}}(%rip), %xmm1 -; X64-SSE-NEXT: andps {{.*}}(%rip), %xmm0 ; X64-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; X64-SSE-NEXT: andps {{.*}}(%rip), %xmm0 ; X64-SSE-NEXT: cvtdq2ps %xmm0, %xmm0 ; X64-SSE-NEXT: retq ; ; X64-AVX-LABEL: mask_uitofp_4i64_4f32: ; X64-AVX: # BB#0: -; X64-AVX-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ; X64-AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; X64-AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 ; X64-AVX-NEXT: vcvtdq2ps %xmm0, %xmm0 ; X64-AVX-NEXT: vzeroupper ; X64-AVX-NEXT: retq diff --git a/test/CodeGen/X86/masked_memop.ll b/test/CodeGen/X86/masked_memop.ll index cba9a221f774..4e65b169c7e6 100644 --- a/test/CodeGen/X86/masked_memop.ll +++ b/test/CodeGen/X86/masked_memop.ll @@ -1009,7 +1009,7 @@ define 
void @one_mask_bit_set3(<4 x i64>* %addr, <4 x i64> %val) { ; ; SKX-LABEL: one_mask_bit_set3: ; SKX: ## BB#0: -; SKX-NEXT: vextracti32x4 $1, %ymm0, %xmm0 +; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0 ; SKX-NEXT: vmovq %xmm0, 16(%rdi) ; SKX-NEXT: retq call void @llvm.masked.store.v4i64.p0v4i64(<4 x i64> %val, <4 x i64>* %addr, i32 4, <4 x i1><i1 false, i1 false, i1 true, i1 false>) @@ -1026,17 +1026,11 @@ define void @one_mask_bit_set4(<4 x double>* %addr, <4 x double> %val) { ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; -; AVX512F-LABEL: one_mask_bit_set4: -; AVX512F: ## BB#0: -; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX512F-NEXT: vmovhpd %xmm0, 24(%rdi) -; AVX512F-NEXT: retq -; -; SKX-LABEL: one_mask_bit_set4: -; SKX: ## BB#0: -; SKX-NEXT: vextractf32x4 $1, %ymm0, %xmm0 -; SKX-NEXT: vmovhpd %xmm0, 24(%rdi) -; SKX-NEXT: retq +; AVX512-LABEL: one_mask_bit_set4: +; AVX512: ## BB#0: +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vmovhpd %xmm0, 24(%rdi) +; AVX512-NEXT: retq call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> %val, <4 x double>* %addr, i32 4, <4 x i1><i1 false, i1 false, i1 false, i1 true>) ret void } @@ -1109,19 +1103,12 @@ define <4 x i64> @load_one_mask_bit_set3(<4 x i64>* %addr, <4 x i64> %val) { ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; -; AVX512F-LABEL: load_one_mask_bit_set3: -; AVX512F: ## BB#0: -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-NEXT: vpinsrq $0, 16(%rdi), %xmm1, %xmm1 -; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512F-NEXT: retq -; -; SKX-LABEL: load_one_mask_bit_set3: -; SKX: ## BB#0: -; SKX-NEXT: vextracti32x4 $1, %ymm0, %xmm1 -; SKX-NEXT: vpinsrq $0, 16(%rdi), %xmm1, %xmm1 -; SKX-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 -; SKX-NEXT: retq +; AVX512-LABEL: load_one_mask_bit_set3: +; AVX512: ## BB#0: +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpinsrq $0, 16(%rdi), %xmm1, %xmm1 +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: retq %res = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* %addr, i32 4, <4 x i1><i1 false, i1 false, i1 true, i1 false>, <4 x i64> %val) ret <4 x i64> %res } @@ -1136,19 +1123,12 @@ define <4 x double> @load_one_mask_bit_set4(<4 x double>* %addr, <4 x double> %v ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX-NEXT: retq ; -; AVX512F-LABEL: load_one_mask_bit_set4: -; AVX512F: ## BB#0: -; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512F-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX512F-NEXT: retq -; -; SKX-LABEL: load_one_mask_bit_set4: -; SKX: ## BB#0: -; SKX-NEXT: vextractf32x4 $1, %ymm0, %xmm1 -; SKX-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; SKX-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0 -; SKX-NEXT: retq +; AVX512-LABEL: load_one_mask_bit_set4: +; AVX512: ## BB#0: +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: retq %res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 4, <4 x i1><i1 false, i1 false, i1 false, i1 true>, <4 x double> %val) ret <4 x double> %res } diff --git a/test/CodeGen/X86/stack-folding-fp-avx512vl.ll b/test/CodeGen/X86/stack-folding-fp-avx512vl.ll index 198a96df6b1f..c6ae85dda43a 100644 --- a/test/CodeGen/X86/stack-folding-fp-avx512vl.ll +++ b/test/CodeGen/X86/stack-folding-fp-avx512vl.ll @@ -488,7 +488,7 @@ define <8 x float> @stack_fold_xorps_ymm(<8 x float> %a0, <8 x 
float> %a1) { define <4 x float> @stack_fold_extractf32x4(<8 x float> %a0, <8 x float> %a1) { ;CHECK-LABEL: stack_fold_extractf32x4 - ;CHECK: vextractf32x4 $1, {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill + ;CHECK: vextractf128 $1, {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill %1 = shufflevector <8 x float> %a0, <8 x float> %a1, <4 x i32> <i32 4, i32 5, i32 6, i32 7> %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() ret <4 x float> %1 @@ -496,7 +496,7 @@ define <4 x float> @stack_fold_extractf32x4(<8 x float> %a0, <8 x float> %a1) { define <2 x double> @stack_fold_extractf64x2(<4 x double> %a0, <4 x double> %a1) { ;CHECK-LABEL: stack_fold_extractf64x2 - ;CHECK: vextractf64x2 $1, {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill + ;CHECK: vextractf128 $1, {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <2 x i32> <i32 2, i32 3> %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() ret <2 x double> %1 @@ -504,7 +504,7 @@ define <2 x double> @stack_fold_extractf64x2(<4 x double> %a0, <4 x double> %a1) define <8 x float> @stack_fold_insertf32x4(<4 x float> %a0, <4 x float> %a1) { ;CHECK-LABEL: stack_fold_insertf32x4 - ;CHECK: vinsertf32x4 $1, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + ;CHECK: vinsertf128 $1, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = shufflevector <4 x float> %a0, <4 x float> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ret <8 x float> %2 @@ -512,7 +512,7 @@ define <8 x float> @stack_fold_insertf32x4(<4 x float> %a0, <4 x float> %a1) { define <4 x double> @stack_fold_insertf64x2(<2 x double> %a0, <2 x double> %a1) { ;CHECK-LABEL: stack_fold_insertf64x2 - ;CHECK: vinsertf64x2 $1, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + ;CHECK: vinsertf128 $1, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = shufflevector <2 x double> %a0, <2 x double> %a1, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ret <4 x double> %2 diff --git a/test/CodeGen/X86/stack-folding-int-avx512vl.ll b/test/CodeGen/X86/stack-folding-int-avx512vl.ll index 
6847595e9278..77afc49b2576 100644 --- a/test/CodeGen/X86/stack-folding-int-avx512vl.ll +++ b/test/CodeGen/X86/stack-folding-int-avx512vl.ll @@ -445,7 +445,7 @@ declare <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.256(<16 x i16>, <16 x i8>, i16 define <4 x i32> @stack_fold_extracti32x4(<8 x i32> %a0, <8 x i32> %a1) { ;CHECK-LABEL: stack_fold_extracti32x4 - ;CHECK: vextracti32x4 $1, {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill + ;CHECK: vextracti128 $1, {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill ; add forces execution domain %1 = add <8 x i32> %a0, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> %2 = shufflevector <8 x i32> %1, <8 x i32> %a1, <4 x i32> <i32 4, i32 5, i32 6, i32 7> @@ -455,7 +455,7 @@ define <4 x i32> @stack_fold_extracti32x4(<8 x i32> %a0, <8 x i32> %a1) { define <2 x i64> @stack_fold_extracti64x2(<4 x i64> %a0, <4 x i64> %a1) { ;CHECK-LABEL: stack_fold_extracti64x2 - ;CHECK: vextracti64x2 $1, {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill + ;CHECK: vextracti128 $1, {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill ; add forces execution domain %1 = add <4 x i64> %a0, <i64 1, i64 1, i64 1, i64 1> %2 = shufflevector <4 x i64> %1, <4 x i64> %a1, <2 x i32> <i32 2, i32 3> @@ -465,7 +465,7 @@ define <2 x i64> @stack_fold_extracti64x2(<4 x i64> %a0, <4 x i64> %a1) { define <8 x i32> @stack_fold_inserti32x4(<4 x i32> %a0, <4 x i32> %a1) { ;CHECK-LABEL: stack_fold_inserti32x4 - ;CHECK: vinserti32x4 $1, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + ;CHECK: vinserti128 $1, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ; add forces execution domain @@ -475,7 +475,7 @@ define <8 x i32> @stack_fold_inserti32x4(<4 x i32> %a0, <4 x i32> %a1) { define <4 x i64> @stack_fold_inserti64x2(<2 x i64> %a0, <2 x i64> %a1) { ;CHECK-LABEL: stack_fold_inserti64x2 - ;CHECK: vinserti64x2 $1, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + ;CHECK: vinserti128 $1, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = shufflevector <2 x i64> %a0, <2 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; add forces execution domain diff --git a/test/CodeGen/X86/subvector-broadcast.ll b/test/CodeGen/X86/subvector-broadcast.ll index b4f7cb9e106d..7aa3f393bbed 100644 --- a/test/CodeGen/X86/subvector-broadcast.ll +++ b/test/CodeGen/X86/subvector-broadcast.ll @@ -832,7 +832,7 @@ define <4 x double> @test_broadcast_2f64_4f64_reuse(<2 x double>* %p0, <2 x doub ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-AVX512F-NEXT: vmovaps (%ecx), %xmm0 ; X32-AVX512F-NEXT: vmovaps %xmm0, 
(%eax) -; X32-AVX512F-NEXT: vinsertf32x4 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X32-AVX512F-NEXT: retl ; ; X32-AVX512BW-LABEL: test_broadcast_2f64_4f64_reuse: @@ -841,7 +841,7 @@ define <4 x double> @test_broadcast_2f64_4f64_reuse(<2 x double>* %p0, <2 x doub ; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-AVX512BW-NEXT: vmovaps (%ecx), %xmm0 ; X32-AVX512BW-NEXT: vmovaps %xmm0, (%eax) -; X32-AVX512BW-NEXT: vinsertf32x4 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX512BW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X32-AVX512BW-NEXT: retl ; ; X32-AVX512DQ-LABEL: test_broadcast_2f64_4f64_reuse: @@ -850,7 +850,7 @@ define <4 x double> @test_broadcast_2f64_4f64_reuse(<2 x double>* %p0, <2 x doub ; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-AVX512DQ-NEXT: vmovapd (%ecx), %xmm0 ; X32-AVX512DQ-NEXT: vmovapd %xmm0, (%eax) -; X32-AVX512DQ-NEXT: vinsertf64x2 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX512DQ-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X32-AVX512DQ-NEXT: retl ; ; X64-AVX-LABEL: test_broadcast_2f64_4f64_reuse: @@ -864,21 +864,21 @@ define <4 x double> @test_broadcast_2f64_4f64_reuse(<2 x double>* %p0, <2 x doub ; X64-AVX512F: ## BB#0: ; X64-AVX512F-NEXT: vmovaps (%rdi), %xmm0 ; X64-AVX512F-NEXT: vmovaps %xmm0, (%rsi) -; X64-AVX512F-NEXT: vinsertf32x4 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X64-AVX512F-NEXT: retq ; ; X64-AVX512BW-LABEL: test_broadcast_2f64_4f64_reuse: ; X64-AVX512BW: ## BB#0: ; X64-AVX512BW-NEXT: vmovaps (%rdi), %xmm0 ; X64-AVX512BW-NEXT: vmovaps %xmm0, (%rsi) -; X64-AVX512BW-NEXT: vinsertf32x4 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX512BW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X64-AVX512BW-NEXT: retq ; ; X64-AVX512DQ-LABEL: test_broadcast_2f64_4f64_reuse: ; X64-AVX512DQ: ## BB#0: ; X64-AVX512DQ-NEXT: vmovapd (%rdi), %xmm0 ; X64-AVX512DQ-NEXT: vmovapd %xmm0, (%rsi) -; X64-AVX512DQ-NEXT: vinsertf64x2 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX512DQ-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X64-AVX512DQ-NEXT: retq %1 = load <2 x double>, <2 x double>* %p0 store <2 x double> %1, <2 x double>* %p1 @@ -896,32 +896,14 @@ define <4 x i64> @test_broadcast_2i64_4i64_reuse(<2 x i64>* %p0, <2 x i64>* %p1) ; X32-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X32-AVX-NEXT: retl ; -; X32-AVX512F-LABEL: test_broadcast_2i64_4i64_reuse: -; X32-AVX512F: ## BB#0: -; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-AVX512F-NEXT: vmovdqa (%ecx), %xmm0 -; X32-AVX512F-NEXT: vmovdqa %xmm0, (%eax) -; X32-AVX512F-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 -; X32-AVX512F-NEXT: retl -; -; X32-AVX512BW-LABEL: test_broadcast_2i64_4i64_reuse: -; X32-AVX512BW: ## BB#0: -; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-AVX512BW-NEXT: vmovdqa (%ecx), %xmm0 -; X32-AVX512BW-NEXT: vmovdqa %xmm0, (%eax) -; X32-AVX512BW-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 -; X32-AVX512BW-NEXT: retl -; -; X32-AVX512DQ-LABEL: test_broadcast_2i64_4i64_reuse: -; X32-AVX512DQ: ## BB#0: -; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-AVX512DQ-NEXT: vmovdqa (%ecx), %xmm0 -; X32-AVX512DQ-NEXT: vmovdqa %xmm0, (%eax) -; X32-AVX512DQ-NEXT: vinserti64x2 $1, %xmm0, %ymm0, %ymm0 -; X32-AVX512DQ-NEXT: retl +; X32-AVX512-LABEL: test_broadcast_2i64_4i64_reuse: +; X32-AVX512: ## BB#0: +; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx +; 
X32-AVX512-NEXT: vmovdqa (%ecx), %xmm0 +; X32-AVX512-NEXT: vmovdqa %xmm0, (%eax) +; X32-AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX512-NEXT: retl ; ; X64-AVX-LABEL: test_broadcast_2i64_4i64_reuse: ; X64-AVX: ## BB#0: @@ -930,26 +912,12 @@ define <4 x i64> @test_broadcast_2i64_4i64_reuse(<2 x i64>* %p0, <2 x i64>* %p1) ; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X64-AVX-NEXT: retq ; -; X64-AVX512F-LABEL: test_broadcast_2i64_4i64_reuse: -; X64-AVX512F: ## BB#0: -; X64-AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; X64-AVX512F-NEXT: vmovdqa %xmm0, (%rsi) -; X64-AVX512F-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 -; X64-AVX512F-NEXT: retq -; -; X64-AVX512BW-LABEL: test_broadcast_2i64_4i64_reuse: -; X64-AVX512BW: ## BB#0: -; X64-AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; X64-AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) -; X64-AVX512BW-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 -; X64-AVX512BW-NEXT: retq -; -; X64-AVX512DQ-LABEL: test_broadcast_2i64_4i64_reuse: -; X64-AVX512DQ: ## BB#0: -; X64-AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 -; X64-AVX512DQ-NEXT: vmovdqa %xmm0, (%rsi) -; X64-AVX512DQ-NEXT: vinserti64x2 $1, %xmm0, %ymm0, %ymm0 -; X64-AVX512DQ-NEXT: retq +; X64-AVX512-LABEL: test_broadcast_2i64_4i64_reuse: +; X64-AVX512: ## BB#0: +; X64-AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; X64-AVX512-NEXT: vmovdqa %xmm0, (%rsi) +; X64-AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX512-NEXT: retq %1 = load <2 x i64>, <2 x i64>* %p0 store <2 x i64> %1, <2 x i64>* %p1 %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1> @@ -957,37 +925,21 @@ define <4 x i64> @test_broadcast_2i64_4i64_reuse(<2 x i64>* %p0, <2 x i64>* %p1) } define <8 x float> @test_broadcast_4f32_8f32_reuse(<4 x float>* %p0, <4 x float>* %p1) { -; X32-AVX-LABEL: test_broadcast_4f32_8f32_reuse: -; X32-AVX: ## BB#0: -; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-AVX-NEXT: vmovaps (%ecx), %xmm0 -; X32-AVX-NEXT: vmovaps %xmm0, (%eax) -; X32-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X32-AVX-NEXT: retl -; -; X32-AVX512-LABEL: test_broadcast_4f32_8f32_reuse: -; X32-AVX512: ## BB#0: -; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-AVX512-NEXT: vmovaps (%ecx), %xmm0 -; X32-AVX512-NEXT: vmovaps %xmm0, (%eax) -; X32-AVX512-NEXT: vinsertf32x4 $1, %xmm0, %ymm0, %ymm0 -; X32-AVX512-NEXT: retl -; -; X64-AVX-LABEL: test_broadcast_4f32_8f32_reuse: -; X64-AVX: ## BB#0: -; X64-AVX-NEXT: vmovaps (%rdi), %xmm0 -; X64-AVX-NEXT: vmovaps %xmm0, (%rsi) -; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X64-AVX-NEXT: retq -; -; X64-AVX512-LABEL: test_broadcast_4f32_8f32_reuse: -; X64-AVX512: ## BB#0: -; X64-AVX512-NEXT: vmovaps (%rdi), %xmm0 -; X64-AVX512-NEXT: vmovaps %xmm0, (%rsi) -; X64-AVX512-NEXT: vinsertf32x4 $1, %xmm0, %ymm0, %ymm0 -; X64-AVX512-NEXT: retq +; X32-LABEL: test_broadcast_4f32_8f32_reuse: +; X32: ## BB#0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: vmovaps (%ecx), %xmm0 +; X32-NEXT: vmovaps %xmm0, (%eax) +; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: test_broadcast_4f32_8f32_reuse: +; X64: ## BB#0: +; X64-NEXT: vmovaps (%rdi), %xmm0 +; X64-NEXT: vmovaps %xmm0, (%rsi) +; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X64-NEXT: retq %1 = load <4 x float>, <4 x float>* %p0 store <4 x float> %1, <4 x float>* %p1 %2 = shufflevector <4 x float> %1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 
3, i32 0, i32 1, i32 2, i32 3> @@ -1010,7 +962,7 @@ define <8 x i32> @test_broadcast_4i32_8i32_reuse(<4 x i32>* %p0, <4 x i32>* %p1) ; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-AVX512-NEXT: vmovdqa (%ecx), %xmm0 ; X32-AVX512-NEXT: vmovdqa %xmm0, (%eax) -; X32-AVX512-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; X32-AVX512-NEXT: retl ; ; X64-AVX-LABEL: test_broadcast_4i32_8i32_reuse: @@ -1024,7 +976,7 @@ define <8 x i32> @test_broadcast_4i32_8i32_reuse(<4 x i32>* %p0, <4 x i32>* %p1) ; X64-AVX512: ## BB#0: ; X64-AVX512-NEXT: vmovdqa (%rdi), %xmm0 ; X64-AVX512-NEXT: vmovdqa %xmm0, (%rsi) -; X64-AVX512-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; X64-AVX512-NEXT: retq %1 = load <4 x i32>, <4 x i32>* %p0 store <4 x i32> %1, <4 x i32>* %p1 @@ -1048,7 +1000,7 @@ define <16 x i16> @test_broadcast_8i16_16i16_reuse(<8 x i16> *%p0, <8 x i16> *%p ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-AVX512F-NEXT: vmovdqa (%ecx), %xmm0 ; X32-AVX512F-NEXT: vmovdqa %xmm0, (%eax) -; X32-AVX512F-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; X32-AVX512F-NEXT: retl ; ; X32-AVX512BW-LABEL: test_broadcast_8i16_16i16_reuse: @@ -1057,7 +1009,7 @@ define <16 x i16> @test_broadcast_8i16_16i16_reuse(<8 x i16> *%p0, <8 x i16> *%p ; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-AVX512BW-NEXT: vmovdqu (%ecx), %xmm0 ; X32-AVX512BW-NEXT: vmovdqu %xmm0, (%eax) -; X32-AVX512BW-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; X32-AVX512BW-NEXT: retl ; ; X32-AVX512DQ-LABEL: test_broadcast_8i16_16i16_reuse: @@ -1066,7 +1018,7 @@ define <16 x i16> @test_broadcast_8i16_16i16_reuse(<8 x i16> *%p0, <8 x i16> *%p ; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-AVX512DQ-NEXT: vmovdqa (%ecx), %xmm0 ; X32-AVX512DQ-NEXT: vmovdqa %xmm0, (%eax) -; X32-AVX512DQ-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; X32-AVX512DQ-NEXT: retl ; ; X64-AVX-LABEL: test_broadcast_8i16_16i16_reuse: @@ -1080,21 +1032,21 @@ define <16 x i16> @test_broadcast_8i16_16i16_reuse(<8 x i16> *%p0, <8 x i16> *%p ; X64-AVX512F: ## BB#0: ; X64-AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; X64-AVX512F-NEXT: vmovdqa %xmm0, (%rsi) -; X64-AVX512F-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; X64-AVX512F-NEXT: retq ; ; X64-AVX512BW-LABEL: test_broadcast_8i16_16i16_reuse: ; X64-AVX512BW: ## BB#0: ; X64-AVX512BW-NEXT: vmovdqu (%rdi), %xmm0 ; X64-AVX512BW-NEXT: vmovdqu %xmm0, (%rsi) -; X64-AVX512BW-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; X64-AVX512BW-NEXT: retq ; ; X64-AVX512DQ-LABEL: test_broadcast_8i16_16i16_reuse: ; X64-AVX512DQ: ## BB#0: ; X64-AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 ; X64-AVX512DQ-NEXT: vmovdqa %xmm0, (%rsi) -; X64-AVX512DQ-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; X64-AVX512DQ-NEXT: retq %1 = load <8 x i16>, <8 x i16> *%p0 store <8 x i16> %1, <8 x i16>* %p1 @@ -1118,7 +1070,7 @@ define <32 x i8> @test_broadcast_16i8_32i8_reuse(<16 x i8> *%p0, <16 x i8> *%p1) ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-AVX512F-NEXT: vmovdqa (%ecx), %xmm0 ; X32-AVX512F-NEXT: vmovdqa %xmm0, (%eax) -; X32-AVX512F-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX512F-NEXT: vinserti128 $1, %xmm0, 
%ymm0, %ymm0 ; X32-AVX512F-NEXT: retl ; ; X32-AVX512BW-LABEL: test_broadcast_16i8_32i8_reuse: @@ -1127,7 +1079,7 @@ define <32 x i8> @test_broadcast_16i8_32i8_reuse(<16 x i8> *%p0, <16 x i8> *%p1) ; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-AVX512BW-NEXT: vmovdqu (%ecx), %xmm0 ; X32-AVX512BW-NEXT: vmovdqu %xmm0, (%eax) -; X32-AVX512BW-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; X32-AVX512BW-NEXT: retl ; ; X32-AVX512DQ-LABEL: test_broadcast_16i8_32i8_reuse: @@ -1136,7 +1088,7 @@ define <32 x i8> @test_broadcast_16i8_32i8_reuse(<16 x i8> *%p0, <16 x i8> *%p1) ; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-AVX512DQ-NEXT: vmovdqa (%ecx), %xmm0 ; X32-AVX512DQ-NEXT: vmovdqa %xmm0, (%eax) -; X32-AVX512DQ-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; X32-AVX512DQ-NEXT: retl ; ; X64-AVX-LABEL: test_broadcast_16i8_32i8_reuse: @@ -1150,21 +1102,21 @@ define <32 x i8> @test_broadcast_16i8_32i8_reuse(<16 x i8> *%p0, <16 x i8> *%p1) ; X64-AVX512F: ## BB#0: ; X64-AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; X64-AVX512F-NEXT: vmovdqa %xmm0, (%rsi) -; X64-AVX512F-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; X64-AVX512F-NEXT: retq ; ; X64-AVX512BW-LABEL: test_broadcast_16i8_32i8_reuse: ; X64-AVX512BW: ## BB#0: ; X64-AVX512BW-NEXT: vmovdqu (%rdi), %xmm0 ; X64-AVX512BW-NEXT: vmovdqu %xmm0, (%rsi) -; X64-AVX512BW-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; X64-AVX512BW-NEXT: retq ; ; X64-AVX512DQ-LABEL: test_broadcast_16i8_32i8_reuse: ; X64-AVX512DQ: ## BB#0: ; X64-AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 ; X64-AVX512DQ-NEXT: vmovdqa %xmm0, (%rsi) -; X64-AVX512DQ-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; X64-AVX512DQ-NEXT: retq %1 = load <16 x i8>, <16 x i8> *%p0 store <16 x i8> %1, <16 x i8>* %p1 @@ -1194,7 +1146,7 @@ define <8 x i32> @test_broadcast_4i32_8i32_chain(<4 x i32>* %p0, <4 x float>* %p ; X32-AVX512F-NEXT: vmovdqa (%ecx), %xmm0 ; X32-AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X32-AVX512F-NEXT: vmovdqa %xmm1, (%eax) -; X32-AVX512F-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; X32-AVX512F-NEXT: retl ; ; X32-AVX512BW-LABEL: test_broadcast_4i32_8i32_chain: @@ -1204,7 +1156,7 @@ define <8 x i32> @test_broadcast_4i32_8i32_chain(<4 x i32>* %p0, <4 x float>* %p ; X32-AVX512BW-NEXT: vmovdqa (%ecx), %xmm0 ; X32-AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X32-AVX512BW-NEXT: vmovdqa %xmm1, (%eax) -; X32-AVX512BW-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; X32-AVX512BW-NEXT: retl ; ; X32-AVX512DQ-LABEL: test_broadcast_4i32_8i32_chain: @@ -1214,7 +1166,7 @@ define <8 x i32> @test_broadcast_4i32_8i32_chain(<4 x i32>* %p0, <4 x float>* %p ; X32-AVX512DQ-NEXT: vmovdqa (%ecx), %xmm0 ; X32-AVX512DQ-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X32-AVX512DQ-NEXT: vmovaps %xmm1, (%eax) -; X32-AVX512DQ-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; X32-AVX512DQ-NEXT: retl ; ; X64-AVX-LABEL: test_broadcast_4i32_8i32_chain: @@ -1230,7 +1182,7 @@ define <8 x i32> @test_broadcast_4i32_8i32_chain(<4 x i32>* %p0, <4 x float>* %p ; X64-AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; X64-AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X64-AVX512F-NEXT: vmovdqa %xmm1, (%rsi) -; 
X64-AVX512F-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; X64-AVX512F-NEXT: retq ; ; X64-AVX512BW-LABEL: test_broadcast_4i32_8i32_chain: @@ -1238,7 +1190,7 @@ define <8 x i32> @test_broadcast_4i32_8i32_chain(<4 x i32>* %p0, <4 x float>* %p ; X64-AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; X64-AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X64-AVX512BW-NEXT: vmovdqa %xmm1, (%rsi) -; X64-AVX512BW-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; X64-AVX512BW-NEXT: retq ; ; X64-AVX512DQ-LABEL: test_broadcast_4i32_8i32_chain: @@ -1246,7 +1198,7 @@ define <8 x i32> @test_broadcast_4i32_8i32_chain(<4 x i32>* %p0, <4 x float>* %p ; X64-AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 ; X64-AVX512DQ-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X64-AVX512DQ-NEXT: vmovaps %xmm1, (%rsi) -; X64-AVX512DQ-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; X64-AVX512DQ-NEXT: retq %1 = load <4 x i32>, <4 x i32>* %p0 store <4 x float> zeroinitializer, <4 x float>* %p1 @@ -1349,6 +1301,44 @@ define <16 x i32> @test_broadcast_4i32_16i32_chain(<4 x i32>* %p0, <4 x float>* @gb4 = global <8 x i64> zeroinitializer, align 8 define void @fallback_broadcast_v4i64_to_v8i64(<4 x i64> %a, <8 x i64> %b) { +; X32-AVX1-LABEL: fallback_broadcast_v4i64_to_v8i64: +; X32-AVX1: ## BB#0: ## %entry +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X32-AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [1,0,2,0,3,0,4,0] +; X32-AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5 +; X32-AVX1-NEXT: vpaddq %xmm5, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; X32-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 +; X32-AVX1-NEXT: vpaddq %xmm5, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; X32-AVX1-NEXT: vpaddq %xmm5, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpaddq %xmm4, %xmm1, %xmm1 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; X32-AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 +; X32-AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2 +; X32-AVX1-NEXT: vmovups %ymm0, _ga4 +; X32-AVX1-NEXT: vmovups %ymm2, _gb4+32 +; X32-AVX1-NEXT: vmovups %ymm1, _gb4 +; X32-AVX1-NEXT: vzeroupper +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: fallback_broadcast_v4i64_to_v8i64: +; X32-AVX2: ## BB#0: ## %entry +; X32-AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [1,0,2,0,3,0,4,0] +; X32-AVX2-NEXT: vpaddq %ymm3, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1 +; X32-AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 +; X32-AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 +; X32-AVX2-NEXT: vmovdqu %ymm0, _ga4 +; X32-AVX2-NEXT: vmovdqu %ymm2, _gb4+32 +; X32-AVX2-NEXT: vmovdqu %ymm1, _gb4 +; X32-AVX2-NEXT: vzeroupper +; X32-AVX2-NEXT: retl +; ; X32-AVX512-LABEL: fallback_broadcast_v4i64_to_v8i64: ; X32-AVX512: ## BB#0: ## %entry ; X32-AVX512-NEXT: vpaddq LCPI26_0, %ymm0, %ymm0 @@ -1359,6 +1349,45 @@ define void @fallback_broadcast_v4i64_to_v8i64(<4 x i64> %a, <8 x i64> %b) { ; X32-AVX512-NEXT: vmovdqu64 %zmm1, _gb4 ; X32-AVX512-NEXT: retl ; +; X64-AVX1-LABEL: fallback_broadcast_v4i64_to_v8i64: +; X64-AVX1: ## BB#0: ## %entry +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [3,4] +; X64-AVX1-NEXT: vpaddq %xmm4, %xmm3, %xmm3 +; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [1,2] +; X64-AVX1-NEXT: vpaddq %xmm5, %xmm0, %xmm0 +; X64-AVX1-NEXT: vinsertf128 
$1, %xmm3, %ymm0, %ymm0 +; X64-AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [1,2,3,4] +; X64-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6 +; X64-AVX1-NEXT: vpaddq %xmm4, %xmm6, %xmm6 +; X64-AVX1-NEXT: vpaddq %xmm5, %xmm2, %xmm2 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2 +; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6 +; X64-AVX1-NEXT: vpaddq %xmm4, %xmm6, %xmm4 +; X64-AVX1-NEXT: vpaddq %xmm5, %xmm1, %xmm1 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; X64-AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 +; X64-AVX1-NEXT: vandps %ymm3, %ymm2, %ymm2 +; X64-AVX1-NEXT: vmovups %ymm0, {{.*}}(%rip) +; X64-AVX1-NEXT: vmovups %ymm2, _gb4+{{.*}}(%rip) +; X64-AVX1-NEXT: vmovups %ymm1, {{.*}}(%rip) +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: fallback_broadcast_v4i64_to_v8i64: +; X64-AVX2: ## BB#0: ## %entry +; X64-AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [1,2,3,4] +; X64-AVX2-NEXT: vpaddq %ymm3, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2 +; X64-AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1 +; X64-AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 +; X64-AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 +; X64-AVX2-NEXT: vmovdqu %ymm0, {{.*}}(%rip) +; X64-AVX2-NEXT: vmovdqu %ymm2, _gb4+{{.*}}(%rip) +; X64-AVX2-NEXT: vmovdqu %ymm1, {{.*}}(%rip) +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; ; X64-AVX512-LABEL: fallback_broadcast_v4i64_to_v8i64: ; X64-AVX512: ## BB#0: ## %entry ; X64-AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [1,2,3,4] @@ -1383,6 +1412,20 @@ entry: @gb2 = global <8 x double> zeroinitializer, align 8 define void @fallback_broadcast_v4f64_to_v8f64(<4 x double> %a, <8 x double> %b) { +; X32-AVX-LABEL: fallback_broadcast_v4f64_to_v8f64: +; X32-AVX: ## BB#0: ## %entry +; X32-AVX-NEXT: vmovapd {{.*#+}} ymm3 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00] +; X32-AVX-NEXT: vaddpd %ymm3, %ymm0, %ymm0 +; X32-AVX-NEXT: vaddpd %ymm3, %ymm2, %ymm2 +; X32-AVX-NEXT: vaddpd %ymm3, %ymm1, %ymm1 +; X32-AVX-NEXT: vdivpd %ymm3, %ymm1, %ymm1 +; X32-AVX-NEXT: vdivpd %ymm3, %ymm2, %ymm2 +; X32-AVX-NEXT: vmovupd %ymm0, _ga2 +; X32-AVX-NEXT: vmovupd %ymm2, _gb2+32 +; X32-AVX-NEXT: vmovupd %ymm1, _gb2 +; X32-AVX-NEXT: vzeroupper +; X32-AVX-NEXT: retl +; ; X32-AVX512-LABEL: fallback_broadcast_v4f64_to_v8f64: ; X32-AVX512: ## BB#0: ## %entry ; X32-AVX512-NEXT: vmovapd {{.*#+}} ymm2 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00] @@ -1394,6 +1437,20 @@ define void @fallback_broadcast_v4f64_to_v8f64(<4 x double> %a, <8 x double> %b) ; X32-AVX512-NEXT: vmovupd %zmm1, _gb2 ; X32-AVX512-NEXT: retl ; +; X64-AVX-LABEL: fallback_broadcast_v4f64_to_v8f64: +; X64-AVX: ## BB#0: ## %entry +; X64-AVX-NEXT: vmovapd {{.*#+}} ymm3 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00] +; X64-AVX-NEXT: vaddpd %ymm3, %ymm0, %ymm0 +; X64-AVX-NEXT: vaddpd %ymm3, %ymm2, %ymm2 +; X64-AVX-NEXT: vaddpd %ymm3, %ymm1, %ymm1 +; X64-AVX-NEXT: vdivpd %ymm3, %ymm1, %ymm1 +; X64-AVX-NEXT: vdivpd %ymm3, %ymm2, %ymm2 +; X64-AVX-NEXT: vmovupd %ymm0, {{.*}}(%rip) +; X64-AVX-NEXT: vmovupd %ymm2, _gb2+{{.*}}(%rip) +; X64-AVX-NEXT: vmovupd %ymm1, {{.*}}(%rip) +; X64-AVX-NEXT: vzeroupper +; X64-AVX-NEXT: retq +; ; X64-AVX512-LABEL: fallback_broadcast_v4f64_to_v8f64: ; X64-AVX512: ## BB#0: ## %entry ; X64-AVX512-NEXT: vmovapd {{.*#+}} ymm2 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00] diff --git a/test/CodeGen/X86/vec_fp_to_int.ll b/test/CodeGen/X86/vec_fp_to_int.ll index 2ced6de6aebe..2ad20a89cf26 100644 --- a/test/CodeGen/X86/vec_fp_to_int.ll +++ b/test/CodeGen/X86/vec_fp_to_int.ll @@ -204,7 +204,7 @@ define <4 x i64> 
@fptosi_4f64_to_4i64(<4 x double> %a) { ; ; AVX512VL-LABEL: fptosi_4f64_to_4i64: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextractf32x4 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX512VL-NEXT: vcvttsd2si %xmm1, %rax ; AVX512VL-NEXT: vmovq %rax, %xmm2 ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] @@ -217,7 +217,7 @@ define <4 x i64> @fptosi_4f64_to_4i64(<4 x double> %a) { ; AVX512VL-NEXT: vcvttsd2si %xmm0, %rax ; AVX512VL-NEXT: vmovq %rax, %xmm0 ; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] -; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq ; ; AVX512DQ-LABEL: fptosi_4f64_to_4i64: @@ -719,7 +719,7 @@ define <4 x i64> @fptoui_4f64_to_4i64(<4 x double> %a) { ; ; AVX512VL-LABEL: fptoui_4f64_to_4i64: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextractf32x4 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX512VL-NEXT: vcvttsd2usi %xmm1, %rax ; AVX512VL-NEXT: vmovq %rax, %xmm2 ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] @@ -732,7 +732,7 @@ define <4 x i64> @fptoui_4f64_to_4i64(<4 x double> %a) { ; AVX512VL-NEXT: vcvttsd2usi %xmm0, %rax ; AVX512VL-NEXT: vmovq %rax, %xmm0 ; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] -; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq ; ; AVX512DQ-LABEL: fptoui_4f64_to_4i64: @@ -1097,7 +1097,7 @@ define <4 x i64> @fptosi_4f32_to_4i64(<8 x float> %a) { ; AVX512VL-NEXT: vcvttss2si %xmm0, %rax ; AVX512VL-NEXT: vmovq %rax, %xmm0 ; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] -; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq ; ; AVX512DQ-LABEL: fptosi_4f32_to_4i64: @@ -1205,7 +1205,7 @@ define <4 x i64> @fptosi_8f32_to_4i64(<8 x float> %a) { ; AVX512VL-NEXT: vmovq %rcx, %xmm1 ; AVX512VL-NEXT: vmovq %rax, %xmm2 ; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX512VL-NEXT: vinserti32x4 $1, %xmm0, %ymm1, %ymm0 +; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; ; AVX512DQ-LABEL: fptosi_8f32_to_4i64: @@ -1822,7 +1822,7 @@ define <4 x i64> @fptoui_4f32_to_4i64(<8 x float> %a) { ; AVX512VL-NEXT: vcvttss2usi %xmm0, %rax ; AVX512VL-NEXT: vmovq %rax, %xmm0 ; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] -; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq ; ; AVX512DQ-LABEL: fptoui_4f32_to_4i64: @@ -2000,7 +2000,7 @@ define <4 x i64> @fptoui_8f32_to_4i64(<8 x float> %a) { ; AVX512VL-NEXT: vmovq %rcx, %xmm1 ; AVX512VL-NEXT: vmovq %rax, %xmm2 ; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX512VL-NEXT: vinserti32x4 $1, %xmm0, %ymm1, %ymm0 +; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; ; AVX512DQ-LABEL: fptoui_8f32_to_4i64: @@ -2409,125 +2409,29 @@ define <4 x i32> @fptosi_2f128_to_4i32(<2 x fp128> %a) nounwind { ; SSE-NEXT: popq %r14 ; SSE-NEXT: retq ; -; VEX-LABEL: fptosi_2f128_to_4i32: -; VEX: # BB#0: -; VEX-NEXT: pushq %r14 -; VEX-NEXT: pushq %rbx -; VEX-NEXT: subq $24, %rsp -; VEX-NEXT: movq %rsi, %r14 -; VEX-NEXT: movq %rdi, %rbx -; VEX-NEXT: movq %rdx, %rdi -; VEX-NEXT: movq %rcx, %rsi -; VEX-NEXT: callq __fixtfdi -; VEX-NEXT: vmovq %rax, %xmm0 -; VEX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; VEX-NEXT: movq %rbx, %rdi -; VEX-NEXT: movq %r14, %rsi -; VEX-NEXT: 
callq __fixtfdi -; VEX-NEXT: vmovq %rax, %xmm0 -; VEX-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; VEX-NEXT: # xmm0 = xmm0[0],mem[0] -; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero -; VEX-NEXT: addq $24, %rsp -; VEX-NEXT: popq %rbx -; VEX-NEXT: popq %r14 -; VEX-NEXT: retq -; -; AVX512F-LABEL: fptosi_2f128_to_4i32: -; AVX512F: # BB#0: -; AVX512F-NEXT: pushq %r14 -; AVX512F-NEXT: pushq %rbx -; AVX512F-NEXT: subq $24, %rsp -; AVX512F-NEXT: movq %rsi, %r14 -; AVX512F-NEXT: movq %rdi, %rbx -; AVX512F-NEXT: movq %rdx, %rdi -; AVX512F-NEXT: movq %rcx, %rsi -; AVX512F-NEXT: callq __fixtfdi -; AVX512F-NEXT: vmovq %rax, %xmm0 -; AVX512F-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX512F-NEXT: movq %rbx, %rdi -; AVX512F-NEXT: movq %r14, %rsi -; AVX512F-NEXT: callq __fixtfdi -; AVX512F-NEXT: vmovq %rax, %xmm0 -; AVX512F-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512F-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero -; AVX512F-NEXT: addq $24, %rsp -; AVX512F-NEXT: popq %rbx -; AVX512F-NEXT: popq %r14 -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: fptosi_2f128_to_4i32: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: pushq %r14 -; AVX512VL-NEXT: pushq %rbx -; AVX512VL-NEXT: subq $24, %rsp -; AVX512VL-NEXT: movq %rsi, %r14 -; AVX512VL-NEXT: movq %rdi, %rbx -; AVX512VL-NEXT: movq %rdx, %rdi -; AVX512VL-NEXT: movq %rcx, %rsi -; AVX512VL-NEXT: callq __fixtfdi -; AVX512VL-NEXT: vmovq %rax, %xmm0 -; AVX512VL-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX512VL-NEXT: movq %rbx, %rdi -; AVX512VL-NEXT: movq %r14, %rsi -; AVX512VL-NEXT: callq __fixtfdi -; AVX512VL-NEXT: vmovq %rax, %xmm0 -; AVX512VL-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero -; AVX512VL-NEXT: addq $24, %rsp -; AVX512VL-NEXT: popq %rbx -; AVX512VL-NEXT: popq %r14 -; AVX512VL-NEXT: retq -; -; AVX512DQ-LABEL: fptosi_2f128_to_4i32: -; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: pushq %r14 -; AVX512DQ-NEXT: pushq %rbx -; AVX512DQ-NEXT: subq $24, %rsp -; AVX512DQ-NEXT: movq %rsi, %r14 -; AVX512DQ-NEXT: movq %rdi, %rbx -; AVX512DQ-NEXT: movq %rdx, %rdi -; AVX512DQ-NEXT: movq %rcx, %rsi -; AVX512DQ-NEXT: callq __fixtfdi -; AVX512DQ-NEXT: vmovq %rax, %xmm0 -; AVX512DQ-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX512DQ-NEXT: movq %rbx, %rdi -; AVX512DQ-NEXT: movq %r14, %rsi -; AVX512DQ-NEXT: callq __fixtfdi -; AVX512DQ-NEXT: vmovq %rax, %xmm0 -; AVX512DQ-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512DQ-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX512DQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero -; AVX512DQ-NEXT: addq $24, %rsp -; AVX512DQ-NEXT: popq %rbx -; AVX512DQ-NEXT: popq %r14 -; AVX512DQ-NEXT: retq -; -; AVX512VLDQ-LABEL: fptosi_2f128_to_4i32: -; AVX512VLDQ: # BB#0: -; AVX512VLDQ-NEXT: pushq %r14 -; AVX512VLDQ-NEXT: pushq %rbx -; AVX512VLDQ-NEXT: subq $24, %rsp -; AVX512VLDQ-NEXT: movq %rsi, %r14 -; AVX512VLDQ-NEXT: movq %rdi, %rbx -; AVX512VLDQ-NEXT: movq %rdx, %rdi -; AVX512VLDQ-NEXT: movq %rcx, %rsi -; AVX512VLDQ-NEXT: callq __fixtfdi -; AVX512VLDQ-NEXT: vmovq %rax, %xmm0 -; AVX512VLDQ-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX512VLDQ-NEXT: movq %rbx, %rdi -; AVX512VLDQ-NEXT: movq %r14, %rsi -; AVX512VLDQ-NEXT: callq __fixtfdi -; AVX512VLDQ-NEXT: vmovq %rax, %xmm0 -; AVX512VLDQ-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512VLDQ-NEXT: # xmm0 = xmm0[0],mem[0] -; 
AVX512VLDQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero -; AVX512VLDQ-NEXT: addq $24, %rsp -; AVX512VLDQ-NEXT: popq %rbx -; AVX512VLDQ-NEXT: popq %r14 -; AVX512VLDQ-NEXT: retq +; AVX-LABEL: fptosi_2f128_to_4i32: +; AVX: # BB#0: +; AVX-NEXT: pushq %r14 +; AVX-NEXT: pushq %rbx +; AVX-NEXT: subq $24, %rsp +; AVX-NEXT: movq %rsi, %r14 +; AVX-NEXT: movq %rdi, %rbx +; AVX-NEXT: movq %rdx, %rdi +; AVX-NEXT: movq %rcx, %rsi +; AVX-NEXT: callq __fixtfdi +; AVX-NEXT: vmovq %rax, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: movq %rbx, %rdi +; AVX-NEXT: movq %r14, %rsi +; AVX-NEXT: callq __fixtfdi +; AVX-NEXT: vmovq %rax, %xmm0 +; AVX-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; AVX-NEXT: addq $24, %rsp +; AVX-NEXT: popq %rbx +; AVX-NEXT: popq %r14 +; AVX-NEXT: retq %cvt = fptosi <2 x fp128> %a to <2 x i32> %ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ret <4 x i32> %ext diff --git a/test/CodeGen/X86/vec_int_to_fp.ll b/test/CodeGen/X86/vec_int_to_fp.ll index 58d7f7bf3d83..6a81cdc490fe 100644 --- a/test/CodeGen/X86/vec_int_to_fp.ll +++ b/test/CodeGen/X86/vec_int_to_fp.ll @@ -288,7 +288,7 @@ define <4 x double> @sitofp_4i64_to_4f64(<4 x i64> %a) { ; ; AVX512VL-LABEL: sitofp_4i64_to_4f64: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax ; AVX512VL-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2 ; AVX512VL-NEXT: vmovq %xmm1, %rax @@ -299,7 +299,7 @@ define <4 x double> @sitofp_4i64_to_4f64(<4 x i64> %a) { ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm0 ; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX512VL-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq ; ; AVX512DQ-LABEL: sitofp_4i64_to_4f64: @@ -821,7 +821,7 @@ define <4 x double> @uitofp_4i64_to_4f64(<4 x i64> %a) { ; ; AVX512VL-LABEL: uitofp_4i64_to_4f64: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax ; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm2 ; AVX512VL-NEXT: vmovq %xmm1, %rax @@ -832,7 +832,7 @@ define <4 x double> @uitofp_4i64_to_4f64(<4 x i64> %a) { ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm0 ; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX512VL-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq ; ; AVX512DQ-LABEL: uitofp_4i64_to_4f64: @@ -1430,7 +1430,7 @@ define <4 x float> @sitofp_4i64_to_4f32(<4 x i64> %a) { ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm0 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] @@ -2344,7 +2344,7 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) { ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm0 +; AVX512VL-NEXT: 
vextracti128 $1, %ymm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm2 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] @@ -2775,7 +2775,7 @@ define <4 x double> @sitofp_load_4i64_to_4f64(<4 x i64> *%a) { ; AVX512VL-LABEL: sitofp_load_4i64_to_4f64: ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax ; AVX512VL-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2 ; AVX512VL-NEXT: vmovq %xmm1, %rax @@ -2786,7 +2786,7 @@ define <4 x double> @sitofp_load_4i64_to_4f64(<4 x i64> *%a) { ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm0 ; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX512VL-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq ; ; AVX512DQ-LABEL: sitofp_load_4i64_to_4f64: @@ -3190,7 +3190,7 @@ define <4 x double> @uitofp_load_4i64_to_4f64(<4 x i64> *%a) { ; AVX512VL-LABEL: uitofp_load_4i64_to_4f64: ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax ; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm2 ; AVX512VL-NEXT: vmovq %xmm1, %rax @@ -3201,7 +3201,7 @@ define <4 x double> @uitofp_load_4i64_to_4f64(<4 x i64> *%a) { ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm0 ; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX512VL-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq ; ; AVX512DQ-LABEL: uitofp_load_4i64_to_4f64: @@ -3426,7 +3426,7 @@ define <4 x float> @sitofp_load_4i64_to_4f32(<4 x i64> *%a) { ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm0 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] @@ -3667,7 +3667,7 @@ define <8 x float> @sitofp_load_8i64_to_8f32(<8 x i64> *%a) { ; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax ; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm0 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] -; AVX512VL-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq ; ; AVX512DQ-LABEL: sitofp_load_8i64_to_8f32: @@ -4013,7 +4013,7 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) { ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm0 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm2 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] @@ -4593,7 +4593,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { ; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax ; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm0 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] -; AVX512VL-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq ; 
; AVX512DQ-LABEL: uitofp_load_8i64_to_8f32: diff --git a/test/CodeGen/X86/vector-half-conversions.ll b/test/CodeGen/X86/vector-half-conversions.ll index 31eb2202a05e..5bf6fbeb6235 100644 --- a/test/CodeGen/X86/vector-half-conversions.ll +++ b/test/CodeGen/X86/vector-half-conversions.ll @@ -461,7 +461,7 @@ define <8 x float> @cvt_8i16_to_8f32(<8 x i16> %a0) nounwind { ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; AVX512VL-NEXT: vinsertf32x4 $1, %xmm4, %ymm0, %ymm0 +; AVX512VL-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 ; AVX512VL-NEXT: retq %1 = bitcast <8 x i16> %a0 to <8 x half> %2 = fpext <8 x half> %1 to <8 x float> @@ -757,7 +757,7 @@ define <16 x float> @cvt_16i16_to_16f32(<16 x i16> %a0) nounwind { ; ; AVX512VL-LABEL: cvt_16i16_to_16f32: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm10 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm10 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: movq %rax, %rcx ; AVX512VL-NEXT: shrq $48, %rcx @@ -840,14 +840,14 @@ define <16 x float> @cvt_16i16_to_16f32(<16 x i16> %a0) nounwind { ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm7[0],xmm5[0],xmm7[2,3] ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1],xmm0[0],xmm2[3] ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0] -; AVX512VL-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm16[0],xmm15[0],xmm16[2,3] ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm14[0],xmm1[3] ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm13[0] ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm12[0],xmm11[0],xmm12[2,3] ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3] ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0] -; AVX512VL-NEXT: vinsertf32x4 $1, %xmm1, %ymm2, %ymm1 +; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX512VL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512VL-NEXT: retq %1 = bitcast <16 x i16> %a0 to <16 x half> @@ -1227,7 +1227,7 @@ define <8 x float> @load_cvt_8i16_to_8f32(<8 x i16>* %a0) nounwind { ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; AVX512VL-NEXT: vinsertf32x4 $1, %xmm4, %ymm0, %ymm0 +; AVX512VL-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 ; AVX512VL-NEXT: retq %1 = load <8 x i16>, <8 x i16>* %a0 %2 = bitcast <8 x i16> %1 to <8 x half> @@ -1491,14 +1491,14 @@ define <16 x float> @load_cvt_16i16_to_16f32(<16 x i16>* %a0) nounwind { ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; AVX512VL-NEXT: vinsertf32x4 $1, %xmm4, %ymm0, %ymm0 +; AVX512VL-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[2,3] ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3] ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0] ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm10[0],xmm11[0],xmm10[2,3] ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3] ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0] -; AVX512VL-NEXT: vinsertf32x4 $1, %xmm1, %ymm2, %ymm1 +; AVX512VL-NEXT: 
vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX512VL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512VL-NEXT: retq %1 = load <16 x i16>, <16 x i16>* %a0 @@ -1738,7 +1738,7 @@ define <4 x double> @cvt_4i16_to_4f64(<4 x i16> %a0) nounwind { ; AVX512VL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 ; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 ; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512VL-NEXT: vinsertf32x4 $1, %xmm2, %ymm0, %ymm0 +; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX512VL-NEXT: retq %1 = bitcast <4 x i16> %a0 to <4 x half> %2 = fpext <4 x half> %1 to <4 x double> @@ -1929,7 +1929,7 @@ define <4 x double> @cvt_8i16_to_4f64(<8 x i16> %a0) nounwind { ; AVX512VL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 ; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 ; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512VL-NEXT: vinsertf32x4 $1, %xmm2, %ymm0, %ymm0 +; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX512VL-NEXT: retq %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> %2 = bitcast <4 x i16> %1 to <4 x half> @@ -2145,14 +2145,14 @@ define <8 x double> @cvt_8i16_to_8f64(<8 x i16> %a0) nounwind { ; AVX512VL-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5 ; AVX512VL-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4 ; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm5[0],xmm4[0] -; AVX512VL-NEXT: vinsertf32x4 $1, %xmm6, %ymm4, %ymm4 +; AVX512VL-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 ; AVX512VL-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 ; AVX512VL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 ; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; AVX512VL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 ; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 ; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512VL-NEXT: vinsertf32x4 $1, %xmm2, %ymm0, %ymm0 +; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX512VL-NEXT: vinsertf64x4 $1, %ymm4, %zmm0, %zmm0 ; AVX512VL-NEXT: retq %1 = bitcast <8 x i16> %a0 to <8 x half> @@ -2350,7 +2350,7 @@ define <4 x double> @load_cvt_4i16_to_4f64(<4 x i16>* %a0) nounwind { ; AVX512VL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 ; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 ; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512VL-NEXT: vinsertf32x4 $1, %xmm2, %ymm0, %ymm0 +; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX512VL-NEXT: retq %1 = load <4 x i16>, <4 x i16>* %a0 %2 = bitcast <4 x i16> %1 to <4 x half> @@ -2474,7 +2474,7 @@ define <4 x double> @load_cvt_8i16_to_4f64(<8 x i16>* %a0) nounwind { ; AVX512VL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 ; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 ; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512VL-NEXT: vinsertf32x4 $1, %xmm2, %ymm0, %ymm0 +; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX512VL-NEXT: retq %1 = load <8 x i16>, <8 x i16>* %a0 %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> @@ -2643,14 +2643,14 @@ define <8 x double> @load_cvt_8i16_to_8f64(<8 x i16>* %a0) nounwind { ; AVX512VL-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5 ; AVX512VL-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4 ; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],xmm5[0] -; AVX512VL-NEXT: vinsertf32x4 $1, %xmm6, %ymm4, %ymm4 +; AVX512VL-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 ; AVX512VL-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 ; AVX512VL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 ; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; AVX512VL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 ; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 ; 
AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512VL-NEXT: vinsertf32x4 $1, %xmm2, %ymm0, %ymm0 +; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX512VL-NEXT: vinsertf64x4 $1, %ymm4, %zmm0, %zmm0 ; AVX512VL-NEXT: retq %1 = load <8 x i16>, <8 x i16>* %a0 @@ -3182,7 +3182,7 @@ define <8 x i16> @cvt_8f32_to_8i16(<8 x float> %a0) nounwind { ; AVX512VL-NEXT: orl %edx, %eax ; AVX512VL-NEXT: shlq $32, %rax ; AVX512VL-NEXT: orq %rcx, %rax -; AVX512VL-NEXT: vextractf32x4 $1, %ymm0, %xmm0 +; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 ; AVX512VL-NEXT: vmovd %xmm1, %ecx @@ -3427,7 +3427,7 @@ define <16 x i16> @cvt_16f32_to_16i16(<16 x float> %a0) nounwind { ; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm2 ; AVX512VL-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 ; AVX512VL-NEXT: vmovd %xmm2, %eax -; AVX512VL-NEXT: vextractf32x4 $1, %ymm1, %xmm2 +; AVX512VL-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3] ; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 ; AVX512VL-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 @@ -3458,7 +3458,7 @@ define <16 x i16> @cvt_16f32_to_16i16(<16 x float> %a0) nounwind { ; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 ; AVX512VL-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 ; AVX512VL-NEXT: vmovd %xmm1, %eax -; AVX512VL-NEXT: vextractf32x4 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] ; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 ; AVX512VL-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 @@ -3479,7 +3479,7 @@ define <16 x i16> @cvt_16f32_to_16i16(<16 x float> %a0) nounwind { ; AVX512VL-NEXT: vpinsrw $6, %eax, %xmm3, %xmm1 ; AVX512VL-NEXT: vmovd %xmm0, %eax ; AVX512VL-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0 -; AVX512VL-NEXT: vinserti32x4 $1, %xmm2, %ymm0, %ymm0 +; AVX512VL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX512VL-NEXT: retq %1 = fptrunc <16 x float> %a0 to <16 x half> %2 = bitcast <16 x half> %1 to <16 x i16> @@ -3958,7 +3958,7 @@ define void @store_cvt_8f32_to_8i16(<8 x float> %a0, <8 x i16>* %a1) nounwind { ; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] ; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 ; AVX512VL-NEXT: vmovd %xmm1, %r10d -; AVX512VL-NEXT: vextractf32x4 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] ; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm2 ; AVX512VL-NEXT: vmovd %xmm2, %r11d @@ -4191,9 +4191,9 @@ define void @store_cvt_16f32_to_16i16(<16 x float> %a0, <16 x i16>* %a1) nounwin ; ; AVX512VL-LABEL: store_cvt_16f32_to_16i16: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextractf32x4 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX512VL-NEXT: vextractf64x4 $1, %zmm0, %ymm2 -; AVX512VL-NEXT: vextractf32x4 $1, %ymm2, %xmm3 +; AVX512VL-NEXT: vextractf128 $1, %ymm2, %xmm3 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm3, %xmm4 ; AVX512VL-NEXT: vmovd %xmm4, %eax ; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm4 @@ -4422,7 +4422,7 @@ define <4 x i16> @cvt_4f64_to_4i16(<4 x double> %a0) nounwind { ; AVX512VL-NEXT: movzwl %ax, %r14d ; AVX512VL-NEXT: orl %ebx, %r14d ; AVX512VL-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX512VL-NEXT: vextractf32x4 $1, %ymm0, %xmm0 +; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512VL-NEXT: callq __truncdfhf2 @@ -4572,7 +4572,7 @@ define <8 x i16> 
@cvt_4f64_to_8i16_undef(<4 x double> %a0) nounwind { ; AVX512VL-NEXT: movzwl %ax, %r14d ; AVX512VL-NEXT: orl %ebx, %r14d ; AVX512VL-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX512VL-NEXT: vextractf32x4 $1, %ymm0, %xmm0 +; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512VL-NEXT: callq __truncdfhf2 @@ -4726,7 +4726,7 @@ define <8 x i16> @cvt_4f64_to_8i16_zero(<4 x double> %a0) nounwind { ; AVX512VL-NEXT: movzwl %ax, %r14d ; AVX512VL-NEXT: orl %ebx, %r14d ; AVX512VL-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX512VL-NEXT: vextractf32x4 $1, %ymm0, %xmm0 +; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512VL-NEXT: callq __truncdfhf2 @@ -4969,7 +4969,7 @@ define <8 x i16> @cvt_8f64_to_8i16(<8 x double> %a0) nounwind { ; AVX512VL-NEXT: movzwl %ax, %r15d ; AVX512VL-NEXT: orl %ebx, %r15d ; AVX512VL-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512VL-NEXT: vextractf32x4 $1, %ymm0, %xmm0 +; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX512VL-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512VL-NEXT: callq __truncdfhf2 @@ -4994,7 +4994,7 @@ define <8 x i16> @cvt_8f64_to_8i16(<8 x double> %a0) nounwind { ; AVX512VL-NEXT: movzwl %ax, %r15d ; AVX512VL-NEXT: orl %ebx, %r15d ; AVX512VL-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX512VL-NEXT: vextractf32x4 $1, %ymm0, %xmm0 +; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512VL-NEXT: callq __truncdfhf2 @@ -5188,7 +5188,7 @@ define void @store_cvt_4f64_to_4i16(<4 x double> %a0, <4 x i16>* %a1) nounwind { ; AVX512VL-NEXT: callq __truncdfhf2 ; AVX512VL-NEXT: movl %eax, %r14d ; AVX512VL-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload -; AVX512VL-NEXT: vextractf32x4 $1, %ymm0, %xmm0 +; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX512VL-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512VL-NEXT: callq __truncdfhf2 @@ -5357,7 +5357,7 @@ define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, <8 x i16>* %a1) noun ; AVX512VL-NEXT: movzwl %ax, %ebx ; AVX512VL-NEXT: orl %ebp, %ebx ; AVX512VL-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX512VL-NEXT: vextractf32x4 $1, %ymm0, %xmm0 +; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512VL-NEXT: callq __truncdfhf2 @@ -5528,7 +5528,7 @@ define void @store_cvt_4f64_to_8i16_zero(<4 x double> %a0, <8 x i16>* %a1) nounw ; AVX512VL-NEXT: movzwl %ax, %ebx ; AVX512VL-NEXT: orl %ebp, %ebx ; AVX512VL-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX512VL-NEXT: vextractf32x4 $1, %ymm0, %xmm0 +; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512VL-NEXT: callq __truncdfhf2 @@ -5775,7 +5775,7 @@ define void @store_cvt_8f64_to_8i16(<8 x double> %a0, <8 x i16>* %a1) nounwind { ; AVX512VL-NEXT: callq __truncdfhf2 ; AVX512VL-NEXT: movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill ; AVX512VL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload -; AVX512VL-NEXT: vextractf32x4 $1, %ymm0, %xmm0 +; AVX512VL-NEXT: vextractf128 
$1, %ymm0, %xmm0 ; AVX512VL-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512VL-NEXT: callq __truncdfhf2 @@ -5787,7 +5787,7 @@ define void @store_cvt_8f64_to_8i16(<8 x double> %a0, <8 x i16>* %a1) nounwind { ; AVX512VL-NEXT: callq __truncdfhf2 ; AVX512VL-NEXT: movl %eax, %r12d ; AVX512VL-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload -; AVX512VL-NEXT: vextractf32x4 $1, %ymm0, %xmm0 +; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX512VL-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512VL-NEXT: callq __truncdfhf2 diff --git a/test/CodeGen/X86/vector-lzcnt-256.ll b/test/CodeGen/X86/vector-lzcnt-256.ll index 3ad13e03dbde..c68395493023 100644 --- a/test/CodeGen/X86/vector-lzcnt-256.ll +++ b/test/CodeGen/X86/vector-lzcnt-256.ll @@ -710,35 +710,20 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind { ; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; -; AVX512VLCD-LABEL: testv32i8: -; AVX512VLCD: ## BB#0: -; AVX512VLCD-NEXT: vextracti32x4 $1, %ymm0, %xmm1 -; AVX512VLCD-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512VLCD-NEXT: vplzcntd %zmm1, %zmm1 -; AVX512VLCD-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512VLCD-NEXT: vmovdqa {{.*#+}} xmm2 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24] -; AVX512VLCD-NEXT: vpsubb %xmm2, %xmm1, %xmm1 -; AVX512VLCD-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512VLCD-NEXT: vplzcntd %zmm0, %zmm0 -; AVX512VLCD-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512VLCD-NEXT: vpsubb %xmm2, %xmm0, %xmm0 -; AVX512VLCD-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 -; AVX512VLCD-NEXT: retq -; -; AVX512CD-LABEL: testv32i8: -; AVX512CD: ## BB#0: -; AVX512CD-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512CD-NEXT: vplzcntd %zmm1, %zmm1 -; AVX512CD-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm2 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24] -; AVX512CD-NEXT: vpsubb %xmm2, %xmm1, %xmm1 -; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; 
AVX512CD-NEXT: vplzcntd %zmm0, %zmm0 -; AVX512CD-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512CD-NEXT: vpsubb %xmm2, %xmm0, %xmm0 -; AVX512CD-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512CD-NEXT: retq +; AVX512-LABEL: testv32i8: +; AVX512: ## BB#0: +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512-NEXT: vplzcntd %zmm1, %zmm1 +; AVX512-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24] +; AVX512-NEXT: vpsubb %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512-NEXT: vplzcntd %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vpsubb %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: retq ; ; X32-AVX-LABEL: testv32i8: ; X32-AVX: # BB#0: @@ -799,35 +784,20 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind { ; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; -; AVX512VLCD-LABEL: testv32i8u: -; AVX512VLCD: ## BB#0: -; AVX512VLCD-NEXT: vextracti32x4 $1, %ymm0, %xmm1 -; AVX512VLCD-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512VLCD-NEXT: vplzcntd %zmm1, %zmm1 -; AVX512VLCD-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512VLCD-NEXT: vmovdqa {{.*#+}} xmm2 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24] -; AVX512VLCD-NEXT: vpsubb %xmm2, %xmm1, %xmm1 -; AVX512VLCD-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512VLCD-NEXT: vplzcntd %zmm0, %zmm0 -; AVX512VLCD-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512VLCD-NEXT: vpsubb %xmm2, %xmm0, %xmm0 -; AVX512VLCD-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 -; AVX512VLCD-NEXT: retq -; -; AVX512CD-LABEL: testv32i8u: -; AVX512CD: ## BB#0: -; AVX512CD-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm1 = 
xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512CD-NEXT: vplzcntd %zmm1, %zmm1 -; AVX512CD-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm2 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24] -; AVX512CD-NEXT: vpsubb %xmm2, %xmm1, %xmm1 -; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0 -; AVX512CD-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512CD-NEXT: vpsubb %xmm2, %xmm0, %xmm0 -; AVX512CD-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512CD-NEXT: retq +; AVX512-LABEL: testv32i8u: +; AVX512: ## BB#0: +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512-NEXT: vplzcntd %zmm1, %zmm1 +; AVX512-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24] +; AVX512-NEXT: vpsubb %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512-NEXT: vplzcntd %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vpsubb %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: retq ; ; X32-AVX-LABEL: testv32i8u: ; X32-AVX: # BB#0: diff --git a/test/CodeGen/X86/vector-shuffle-256-v16.ll b/test/CodeGen/X86/vector-shuffle-256-v16.ll index ba47740cbaf0..3c7fd8b51a02 100644 --- a/test/CodeGen/X86/vector-shuffle-256-v16.ll +++ b/test/CodeGen/X86/vector-shuffle-256-v16.ll @@ -1789,25 +1789,15 @@ define <16 x i16> @shuffle_v16i16_00_01_00_01_02_03_02_11_08_09_08_09_10_11_10_1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_00_01_00_01_02_03_02_11_08_09_08_09_10_11_10_11: -; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,0,2,4,5,6,7] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: 
shuffle_v16i16_00_01_00_01_02_03_02_11_08_09_08_09_10_11_10_11: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,0,2,4,5,6,7] -; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v16i16_00_01_00_01_02_03_02_11_08_09_08_09_10_11_10_11: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2OR512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,0,2,4,5,6,7] +; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 2, i32 3, i32 2, i32 11, i32 8, i32 9, i32 8, i32 9, i32 10, i32 11, i32 10, i32 11> ret <16 x i16> %shuffle } @@ -1822,23 +1812,14 @@ define <16 x i16> @shuffle_v16i16_06_07_04_05_02_03_00_09_14_15_12_13_10_11_08_0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_06_07_04_05_02_03_00_09_14_15_12_13_10_11_08_09: -; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0] -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,1,0] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i16_06_07_04_05_02_03_00_09_14_15_12_13_10_11_08_09: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,1,0] -; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v16i16_06_07_04_05_02_03_00_09_14_15_12_13_10_11_08_09: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,1,0] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 6, i32 7, i32 4, i32 5, i32 2, i32 3, i32 0, i32 9, i32 14, i32 15, i32 12, i32 13, i32 10, i32 11, i32 8, i32 9> ret <16 x i16> %shuffle } @@ -1885,23 +1866,14 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_08: -; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,2,3] -; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1 -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: 
shuffle_v16i16_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_08: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,2,3] -; AVX512VL-NEXT: vpbroadcastw %xmm1, %xmm1 -; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_08: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2OR512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,2,3] +; AVX2OR512VL-NEXT: vpbroadcastw %xmm1, %xmm1 +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8> ret <16 x i16> %shuffle } @@ -1919,29 +1891,17 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_04_04_04_12_08_08_08_08_12_12_12_1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_12_08_08_08_08_12_12_12_12: -; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpsllq $48, %xmm1, %xmm2 -; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,7] -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_12_08_08_08_08_12_12_12_12: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpsllq $48, %xmm1, %xmm2 -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,7] -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_12_08_08_08_08_12_12_12_12: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2OR512VL-NEXT: vpsllq $48, %xmm1, %xmm2 +; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,7] +; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] +; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] +; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 12, i32 8, i32 8, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12> ret <16 x i16> %shuffle } @@ -1957,25 +1917,15 @@ define <16 x i16> @shuffle_v16i16_uu_00_uu_01_uu_02_uu_11_uu_08_uu_09_uu_10_uu_1 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: 
shuffle_v16i16_uu_00_uu_01_uu_02_uu_11_uu_08_uu_09_uu_10_uu_11: -; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i16_uu_00_uu_01_uu_02_uu_11_uu_08_uu_09_uu_10_uu_11: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7] -; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7] -; AVX512VL-NEXT: vinserti32x4 $1, %xmm2, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v16i16_uu_00_uu_01_uu_02_uu_11_uu_08_uu_09_uu_10_uu_11: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2OR512VL-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2OR512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7] +; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 0, i32 undef, i32 1, i32 undef, i32 2, i32 undef, i32 11, i32 undef, i32 8, i32 undef, i32 9, i32 undef, i32 10, i32 undef, i32 11> ret <16 x i16> %shuffle } @@ -1991,25 +1941,15 @@ define <16 x i16> @shuffle_v16i16_uu_04_uu_05_uu_06_uu_15_uu_12_uu_13_uu_14_uu_1 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_uu_04_uu_05_uu_06_uu_15_uu_12_uu_13_uu_14_uu_15: -; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i16_uu_04_uu_05_uu_06_uu_15_uu_12_uu_13_uu_14_uu_15: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512VL-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7] -; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7] -; AVX512VL-NEXT: vinserti32x4 $1, %xmm2, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v16i16_uu_04_uu_05_uu_06_uu_15_uu_12_uu_13_uu_14_uu_15: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2OR512VL-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX2OR512VL-NEXT: vpunpckhwd 
{{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7] +; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 4, i32 undef, i32 5, i32 undef, i32 6, i32 undef, i32 15, i32 undef, i32 12, i32 undef, i32 13, i32 undef, i32 14, i32 undef, i32 15> ret <16 x i16> %shuffle } @@ -2026,27 +1966,16 @@ define <16 x i16> @shuffle_v16i16_03_01_02_00_06_07_04_13_11_09_10_08_14_15_12_1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_03_01_02_00_06_07_04_13_11_09_10_08_14_15_12_13: -; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,0,4,5,6,7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i16_03_01_02_00_06_07_04_13_11_09_10_08_14_15_12_13: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7] -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2] -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,0,4,5,6,7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2] -; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v16i16_03_01_02_00_06_07_04_13_11_09_10_08_14_15_12_13: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7] +; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7] +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2] +; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,0,4,5,6,7] +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 3, i32 1, i32 2, i32 0, i32 6, i32 7, i32 4, i32 13, i32 11, i32 9, i32 10, i32 8, i32 14, i32 15, i32 12, i32 13> ret <16 x i16> %shuffle } @@ -2062,25 +1991,15 @@ define <16 x i16> @shuffle_v16i16_04_04_04_04_00_00_00_08_12_12_12_12_08_08_08_0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_04_04_04_04_00_00_00_08_12_12_12_12_08_08_08_08: -; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpbroadcastw %xmm1, %xmm2 -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,14,15] -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,8,9,8,9,0,1,0,1,0,1,0,1] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i16_04_04_04_04_00_00_00_08_12_12_12_12_08_08_08_08: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpbroadcastw %xmm1, %xmm2 -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,14,15] -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = 
xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,8,9,8,9,0,1,0,1,0,1,0,1] -; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v16i16_04_04_04_04_00_00_00_08_12_12_12_12_08_08_08_08: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2OR512VL-NEXT: vpbroadcastw %xmm1, %xmm2 +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,14,15] +; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,8,9,8,9,0,1,0,1,0,1,0,1] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 0, i32 0, i32 0, i32 8, i32 12, i32 12, i32 12, i32 12, i32 8, i32 8, i32 8, i32 8> ret <16 x i16> %shuffle } @@ -2095,23 +2014,14 @@ define <16 x i16> @shuffle_v16i16_02_03_00_01_06_07_04_13_10_11_08_09_14_15_12_1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_13_10_11_08_09_14_15_12_13: -; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2] -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,3,2] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_13_10_11_08_09_14_15_12_13: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,3,2] -; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_13_10_11_08_09_14_15_12_13: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7] +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2] +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,3,2] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 13, i32 10, i32 11, i32 8, i32 9, i32 14, i32 15, i32 12, i32 13> ret <16 x i16> %shuffle } @@ -2128,27 +2038,16 @@ define <16 x i16> @shuffle_v16i16_02_03_00_02_06_07_04_13_10_11_08_10_14_15_12_1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_02_03_00_02_06_07_04_13_10_11_08_10_14_15_12_13: -; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,3,0,2,4,5,6,7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,3,0,2,4,5,6,7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i16_02_03_00_02_06_07_04_13_10_11_08_10_14_15_12_13: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7] -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,3,0,2,4,5,6,7] -; 
AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2] -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,3,0,2,4,5,6,7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2] -; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v16i16_02_03_00_02_06_07_04_13_10_11_08_10_14_15_12_13: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7] +; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,3,0,2,4,5,6,7] +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2] +; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,3,0,2,4,5,6,7] +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 2, i32 3, i32 0, i32 2, i32 6, i32 7, i32 4, i32 13, i32 10, i32 11, i32 8, i32 10, i32 14, i32 15, i32 12, i32 13> ret <16 x i16> %shuffle } @@ -2164,25 +2063,15 @@ define <16 x i16> @shuffle_v16i16_02_03_00_01_06_07_04_15_10_11_08_09_14_15_12_1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_15_10_11_08_09_14_15_12_15: -; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2] -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,2,3] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,7,4,7] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_15_10_11_08_09_14_15_12_15: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2] -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,2,3] -; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,7,4,7] -; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_15_10_11_08_09_14_15_12_15: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2] +; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,2,3] +; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,7,4,7] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 15, i32 10, i32 11, i32 8, i32 9, i32 14, i32 15, i32 12, i32 15> ret <16 x i16> %shuffle } @@ -2210,12 +2099,12 @@ define <16 x i16> @shuffle_v16i16_07_05_06_04_03_01_02_08_15_13_14_12_11_09_10_0 ; ; AVX512VL-LABEL: shuffle_v16i16_07_05_06_04_03_01_02_08_15_13_14_12_11_09_10_08: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512VL-NEXT: vmovdqu {{.*#+}} xmm2 = [14,15,10,11,12,13,8,9,6,7,2,3,4,5,0,1] ; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm3 ; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7] ; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512VL-NEXT: vinserti32x4 $1, %xmm3, %ymm0, %ymm0 +; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 ; AVX512VL-NEXT: retq %shuffle = 
shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 7, i32 5, i32 6, i32 4, i32 3, i32 1, i32 2, i32 8, i32 15, i32 13, i32 14, i32 12, i32 11, i32 9, i32 10, i32 8> ret <16 x i16> %shuffle @@ -2232,25 +2121,15 @@ define <16 x i16> @shuffle_v16i16_01_00_05_04_05_04_01_08_09_08_13_12_13_12_09_0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_01_00_05_04_05_04_01_08_09_08_13_12_13_12_09_08: -; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpbroadcastw %xmm1, %xmm2 -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,10,11,8,9,10,11,8,9,2,3,2,3] -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,3,0,1,10,11,8,9,10,11,8,9,2,3,0,1] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i16_01_00_05_04_05_04_01_08_09_08_13_12_13_12_09_08: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpbroadcastw %xmm1, %xmm2 -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,10,11,8,9,10,11,8,9,2,3,2,3] -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,3,0,1,10,11,8,9,10,11,8,9,2,3,0,1] -; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v16i16_01_00_05_04_05_04_01_08_09_08_13_12_13_12_09_08: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2OR512VL-NEXT: vpbroadcastw %xmm1, %xmm2 +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,10,11,8,9,10,11,8,9,2,3,2,3] +; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,3,0,1,10,11,8,9,10,11,8,9,2,3,0,1] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 1, i32 0, i32 5, i32 4, i32 5, i32 4, i32 1, i32 8, i32 9, i32 8, i32 13, i32 12, i32 13, i32 12, i32 9, i32 8> ret <16 x i16> %shuffle } @@ -2266,25 +2145,15 @@ define <16 x i16> @shuffle_v16i16_05_04_01_00_05_04_01_08_13_12_09_08_13_12_09_0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_05_04_01_00_05_04_01_08_13_12_09_08_13_12_09_08: -; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpbroadcastw %xmm1, %xmm2 -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,10,11,8,9,2,3,2,3] -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,11,8,9,2,3,0,1,10,11,8,9,2,3,0,1] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i16_05_04_01_00_05_04_01_08_13_12_09_08_13_12_09_08: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpbroadcastw %xmm1, %xmm2 -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,10,11,8,9,2,3,2,3] -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,11,8,9,2,3,0,1,10,11,8,9,2,3,0,1] -; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v16i16_05_04_01_00_05_04_01_08_13_12_09_08_13_12_09_08: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2OR512VL-NEXT: vpbroadcastw %xmm1, %xmm2 +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = 
xmm0[10,11,8,9,2,3,0,1,10,11,8,9,2,3,2,3] +; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,11,8,9,2,3,0,1,10,11,8,9,2,3,0,1] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 5, i32 4, i32 1, i32 0, i32 5, i32 4, i32 1, i32 8, i32 13, i32 12, i32 9, i32 8, i32 13, i32 12, i32 9, i32 8> ret <16 x i16> %shuffle } @@ -2300,25 +2169,15 @@ define <16 x i16> @shuffle_v16i16_05_04_01_00_01_00_05_12_13_12_09_08_09_08_13_1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_05_04_01_00_01_00_05_12_13_12_09_08_09_08_13_12: -; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpsllq $48, %xmm1, %xmm2 -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,2,3,0,1,10,11,2,3] -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,11,8,9,2,3,0,1,2,3,0,1,10,11,8,9] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i16_05_04_01_00_01_00_05_12_13_12_09_08_09_08_13_12: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpsllq $48, %xmm1, %xmm2 -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,2,3,0,1,10,11,2,3] -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,11,8,9,2,3,0,1,2,3,0,1,10,11,8,9] -; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v16i16_05_04_01_00_01_00_05_12_13_12_09_08_09_08_13_12: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2OR512VL-NEXT: vpsllq $48, %xmm1, %xmm2 +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,2,3,0,1,10,11,2,3] +; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,11,8,9,2,3,0,1,2,3,0,1,10,11,8,9] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 5, i32 4, i32 1, i32 0, i32 1, i32 0, i32 5, i32 12, i32 13, i32 12, i32 9, i32 8, i32 9, i32 8, i32 13, i32 12> ret <16 x i16> %shuffle } @@ -2334,25 +2193,15 @@ define <16 x i16> @shuffle_v16i16_00_04_04_00_00_04_04_08_08_12_12_08_08_12_12_0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_00_04_04_00_00_04_04_08_08_12_12_08_08_12_12_08: -; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpbroadcastw %xmm1, %xmm2 -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,8,9,8,9,2,3] -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,0,1,8,9,8,9,0,1] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i16_00_04_04_00_00_04_04_08_08_12_12_08_08_12_12_08: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpbroadcastw %xmm1, %xmm2 -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,8,9,8,9,2,3] -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,0,1,8,9,8,9,0,1] -; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: 
retq +; AVX2OR512VL-LABEL: shuffle_v16i16_00_04_04_00_00_04_04_08_08_12_12_08_08_12_12_08: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2OR512VL-NEXT: vpbroadcastw %xmm1, %xmm2 +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,8,9,8,9,2,3] +; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,0,1,8,9,8,9,0,1] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 4, i32 4, i32 0, i32 0, i32 4, i32 4, i32 8, i32 8, i32 12, i32 12, i32 8, i32 8, i32 12, i32 12, i32 8> ret <16 x i16> %shuffle } @@ -2368,25 +2217,15 @@ define <16 x i16> @shuffle_v16i16_04_00_00_04_04_00_00_12_12_08_08_12_12_08_08_1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_04_00_00_04_04_00_00_12_12_08_08_12_12_08_08_12: -; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpsllq $48, %xmm1, %xmm2 -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,0,1,0,1,8,9,8,9,0,1,0,1,2,3] -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,0,1,0,1,8,9,8,9,0,1,0,1,8,9] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i16_04_00_00_04_04_00_00_12_12_08_08_12_12_08_08_12: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpsllq $48, %xmm1, %xmm2 -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,0,1,0,1,8,9,8,9,0,1,0,1,2,3] -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,0,1,0,1,8,9,8,9,0,1,0,1,8,9] -; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v16i16_04_00_00_04_04_00_00_12_12_08_08_12_12_08_08_12: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2OR512VL-NEXT: vpsllq $48, %xmm1, %xmm2 +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,0,1,0,1,8,9,8,9,0,1,0,1,2,3] +; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,0,1,0,1,8,9,8,9,0,1,0,1,8,9] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 0, i32 0, i32 4, i32 4, i32 0, i32 0, i32 12, i32 12, i32 8, i32 8, i32 12, i32 12, i32 8, i32 8, i32 12> ret <16 x i16> %shuffle } @@ -2414,12 +2253,12 @@ define <16 x i16> @shuffle_v16i16_02_06_04_00_05_01_07_11_10_14_12_08_13_09_15_1 ; ; AVX512VL-LABEL: shuffle_v16i16_02_06_04_00_05_01_07_11_10_14_12_08_13_09_15_11: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512VL-NEXT: vmovdqu {{.*#+}} xmm2 = [4,5,12,13,8,9,0,1,10,11,2,3,14,15,6,7] ; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm3 ; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7] ; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512VL-NEXT: vinserti32x4 $1, %xmm3, %ymm0, %ymm0 +; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 2, i32 6, i32 4, i32 0, i32 5, i32 1, i32 7, i32 11, i32 10, i32 14, i32 12, i32 8, i32 13, i32 9, i32 15, i32 11> ret <16 x i16> %shuffle @@ -2448,12 +2287,12 @@ 
define <16 x i16> @shuffle_v16i16_02_00_06_04_05_01_07_11_10_08_14_12_13_09_15_1 ; ; AVX512VL-LABEL: shuffle_v16i16_02_00_06_04_05_01_07_11_10_08_14_12_13_09_15_11: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512VL-NEXT: vmovdqu {{.*#+}} xmm2 = [4,5,0,1,12,13,8,9,10,11,2,3,14,15,6,7] ; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm3 ; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7] ; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512VL-NEXT: vinserti32x4 $1, %xmm3, %ymm0, %ymm0 +; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 2, i32 0, i32 6, i32 4, i32 5, i32 1, i32 7, i32 11, i32 10, i32 8, i32 14, i32 12, i32 13, i32 9, i32 15, i32 11> ret <16 x i16> %shuffle @@ -2482,12 +2321,12 @@ define <16 x i16> @shuffle_v16i16_02_06_04_00_01_03_07_13_10_14_12_08_09_11_15_1 ; ; AVX512VL-LABEL: shuffle_v16i16_02_06_04_00_01_03_07_13_10_14_12_08_09_11_15_13: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512VL-NEXT: vmovdqu {{.*#+}} xmm2 = [4,5,12,13,8,9,0,1,2,3,6,7,14,15,10,11] ; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm3 ; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7] ; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512VL-NEXT: vinserti32x4 $1, %xmm3, %ymm0, %ymm0 +; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 2, i32 6, i32 4, i32 0, i32 1, i32 3, i32 7, i32 13, i32 10, i32 14, i32 12, i32 8, i32 9, i32 11, i32 15, i32 13> ret <16 x i16> %shuffle @@ -2516,12 +2355,12 @@ define <16 x i16> @shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_1 ; ; AVX512VL-LABEL: shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_11: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512VL-NEXT: vmovdqu {{.*#+}} xmm2 = [12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7] ; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm3 ; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512VL-NEXT: vinserti32x4 $1, %xmm3, %ymm0, %ymm0 +; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 6, i32 6, i32 7, i32 5, i32 1, i32 6, i32 4, i32 11, i32 14, i32 14, i32 15, i32 13, i32 9, i32 14, i32 12, i32 11> ret <16 x i16> %shuffle @@ -2538,25 +2377,15 @@ define <16 x i16> @shuffle_v16i16_00_00_04_04_04_04_04_12_08_08_12_12_12_12_12_1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_00_00_04_04_04_04_04_12_08_08_12_12_12_12_12_12: -; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpsllq $48, %xmm1, %xmm2 -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,8,9,8,9,8,9,8,9,8,9,14,15] -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,8,9,8,9,8,9,8,9,8,9,8,9] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i16_00_00_04_04_04_04_04_12_08_08_12_12_12_12_12_12: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpsllq $48, %xmm1, %xmm2 -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = 
xmm0[0,1,0,1,8,9,8,9,8,9,8,9,8,9,14,15] -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,8,9,8,9,8,9,8,9,8,9,8,9] -; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_04_04_04_04_04_12_08_08_12_12_12_12_12_12: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2OR512VL-NEXT: vpsllq $48, %xmm1, %xmm2 +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,8,9,8,9,8,9,8,9,8,9,14,15] +; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,8,9,8,9,8,9,8,9,8,9,8,9] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 4, i32 4, i32 4, i32 4, i32 4, i32 12, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12> ret <16 x i16> %shuffle } @@ -2572,25 +2401,15 @@ define <16 x i16> @shuffle_v16i16_04_04_00_00_04_04_04_12_12_12_08_08_12_12_12_1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_04_04_00_00_04_04_04_12_12_12_08_08_12_12_12_12: -; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpsllq $48, %xmm1, %xmm2 -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,0,1,0,1,8,9,8,9,8,9,14,15] -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,0,1,0,1,8,9,8,9,8,9,8,9] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i16_04_04_00_00_04_04_04_12_12_12_08_08_12_12_12_12: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpsllq $48, %xmm1, %xmm2 -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,0,1,0,1,8,9,8,9,8,9,14,15] -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,0,1,0,1,8,9,8,9,8,9,8,9] -; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v16i16_04_04_00_00_04_04_04_12_12_12_08_08_12_12_12_12: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2OR512VL-NEXT: vpsllq $48, %xmm1, %xmm2 +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,0,1,0,1,8,9,8,9,8,9,14,15] +; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,0,1,0,1,8,9,8,9,8,9,8,9] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 4, i32 0, i32 0, i32 4, i32 4, i32 4, i32 12, i32 12, i32 12, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12> ret <16 x i16> %shuffle } @@ -2606,25 +2425,15 @@ define <16 x i16> @shuffle_v16i16_00_04_04_00_04_04_04_12_08_12_12_08_12_12_12_1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_00_04_04_00_04_04_04_12_08_12_12_08_12_12_12_12: -; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpsllq $48, %xmm1, %xmm2 -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,14,15] -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq 
-; -; AVX512VL-LABEL: shuffle_v16i16_00_04_04_00_04_04_04_12_08_12_12_08_12_12_12_12: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpsllq $48, %xmm1, %xmm2 -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,14,15] -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9] -; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v16i16_00_04_04_00_04_04_04_12_08_12_12_08_12_12_12_12: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2OR512VL-NEXT: vpsllq $48, %xmm1, %xmm2 +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,14,15] +; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 4, i32 4, i32 0, i32 4, i32 4, i32 4, i32 12, i32 8, i32 12, i32 12, i32 8, i32 12, i32 12, i32 12, i32 12> ret <16 x i16> %shuffle } @@ -2640,25 +2449,15 @@ define <16 x i16> @shuffle_v16i16_00_04_04_00_00_00_00_08_08_12_12_08_08_08_08_0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_00_04_04_00_00_00_00_08_08_12_12_08_08_08_08_08: -; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpbroadcastw %xmm1, %xmm2 -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,0,1,0,1,14,15] -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,0,1,0,1,0,1,0,1] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i16_00_04_04_00_00_00_00_08_08_12_12_08_08_08_08_08: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpbroadcastw %xmm1, %xmm2 -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,0,1,0,1,14,15] -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,0,1,0,1,0,1,0,1] -; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v16i16_00_04_04_00_00_00_00_08_08_12_12_08_08_08_08_08: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2OR512VL-NEXT: vpbroadcastw %xmm1, %xmm2 +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,0,1,0,1,14,15] +; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,0,1,0,1,0,1,0,1] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 4, i32 4, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 12, i32 12, i32 8, i32 8, i32 8, i32 8, i32 8> ret <16 x i16> %shuffle } @@ -2675,27 +2474,16 @@ define <16 x i16> @shuffle_v16i16_00_04_04_00_04_05_06_15_08_12_12_08_12_13_14_1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_00_04_04_00_04_05_06_15_08_12_12_08_12_13_14_15: -; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = 
xmm0[0,2,2,0,4,5,6,7] -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,0,4,5,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i16_00_04_04_00_04_05_06_15_08_12_12_08_12_13_14_15: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7] -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,0,4,5,6,7] -; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v16i16_00_04_04_00_04_05_06_15_08_12_12_08_12_13_14_15: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7] +; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,0,4,5,6,7] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 4, i32 4, i32 0, i32 4, i32 5, i32 6, i32 15, i32 8, i32 12, i32 12, i32 8, i32 12, i32 13, i32 14, i32 15> ret <16 x i16> %shuffle } @@ -2711,25 +2499,15 @@ define <16 x i16> @shuffle_v16i16_00_uu_04_04_04_04_04_12_08_uu_12_12_12_12_12_1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_00_uu_04_04_04_04_04_12_08_uu_12_12_12_12_12_12: -; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpsllq $48, %xmm1, %xmm2 -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,8,9,8,9,8,9,8,9,8,9,14,15] -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,8,9,8,9,8,9,8,9,8,9,8,9] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i16_00_uu_04_04_04_04_04_12_08_uu_12_12_12_12_12_12: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpsllq $48, %xmm1, %xmm2 -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,8,9,8,9,8,9,8,9,8,9,14,15] -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,8,9,8,9,8,9,8,9,8,9,8,9] -; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v16i16_00_uu_04_04_04_04_04_12_08_uu_12_12_12_12_12_12: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2OR512VL-NEXT: vpsllq $48, %xmm1, %xmm2 +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,8,9,8,9,8,9,8,9,8,9,14,15] +; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,8,9,8,9,8,9,8,9,8,9,8,9] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 undef, i32 4, i32 4, i32 4, i32 4, i32 4, i32 12, i32 8, i32 undef, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12> ret <16 x i16> %shuffle } @@ -2745,25 +2523,15 @@ define <16 x i16> 
@shuffle_v16i16_04_04_uu_00_04_04_04_12_12_12_uu_08_12_12_12_1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_04_04_uu_00_04_04_04_12_12_12_uu_08_12_12_12_12: -; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpsllq $48, %xmm1, %xmm2 -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,0,1,8,9,8,9,8,9,14,15] -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,8,9,0,1,8,9,8,9,8,9,8,9] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i16_04_04_uu_00_04_04_04_12_12_12_uu_08_12_12_12_12: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpsllq $48, %xmm1, %xmm2 -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,0,1,8,9,8,9,8,9,14,15] -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,8,9,0,1,8,9,8,9,8,9,8,9] -; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v16i16_04_04_uu_00_04_04_04_12_12_12_uu_08_12_12_12_12: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2OR512VL-NEXT: vpsllq $48, %xmm1, %xmm2 +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,0,1,8,9,8,9,8,9,14,15] +; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,8,9,0,1,8,9,8,9,8,9,8,9] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 4, i32 undef, i32 0, i32 4, i32 4, i32 4, i32 12, i32 12, i32 12, i32 undef, i32 8, i32 12, i32 12, i32 12, i32 12> ret <16 x i16> %shuffle } @@ -2779,25 +2547,15 @@ define <16 x i16> @shuffle_v16i16_uu_04_04_00_04_04_04_12_uu_12_12_08_12_12_12_1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_uu_04_04_00_04_04_04_12_uu_12_12_08_12_12_12_12: -; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpsllq $48, %xmm1, %xmm2 -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,14,15] -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i16_uu_04_04_00_04_04_04_12_uu_12_12_08_12_12_12_12: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpsllq $48, %xmm1, %xmm2 -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,14,15] -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9] -; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v16i16_uu_04_04_00_04_04_04_12_uu_12_12_08_12_12_12_12: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2OR512VL-NEXT: vpsllq $48, %xmm1, %xmm2 +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,14,15] +; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: retq 
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 4, i32 4, i32 0, i32 4, i32 4, i32 4, i32 12, i32 undef, i32 12, i32 12, i32 8, i32 12, i32 12, i32 12, i32 12> ret <16 x i16> %shuffle } @@ -2848,13 +2606,13 @@ define <16 x i16> @shuffle_v16i16_uu_uu_uu_uu_04_05_06_11_uu_uu_uu_uu_12_13_14_1 ; ; AVX512VL-LABEL: shuffle_v16i16_uu_uu_uu_uu_04_05_06_11_uu_uu_uu_uu_12_13_14_11: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512VL-NEXT: vpbroadcastq %xmm1, %xmm2 ; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] ; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] -; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 4, i32 5, i32 6, i32 11, i32 undef, i32 undef, i32 undef, i32 undef, i32 12, i32 13, i32 14, i32 11> ret <16 x i16> %shuffle @@ -2926,7 +2684,7 @@ define <16 x i16> @shuffle_v16i16_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14_1 ; ; AVX512VL-LABEL: shuffle_v16i16_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14_11: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7] ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] @@ -2934,7 +2692,7 @@ define <16 x i16> @shuffle_v16i16_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14_1 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] ; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,1,2] -; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 7, i32 4, i32 5, i32 6, i32 11, i32 8, i32 9, i32 10, i32 15, i32 12, i32 13, i32 14, i32 11> ret <16 x i16> %shuffle @@ -2961,13 +2719,13 @@ define <16 x i16> @shuffle_v16i16_04_05_06_03_00_01_02_15_12_13_14_11_08_09_10_1 ; ; AVX512VL-LABEL: shuffle_v16i16_04_05_06_03_00_01_02_15_12_13_14_11_08_09_10_15: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,0,1,2,3,4,5,2,3] ; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] ; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,0,3,1] -; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 5, i32 6, i32 3, i32 0, i32 1, i32 2, i32 15, i32 12, i32 13, i32 14, i32 11, i32 8, i32 9, i32 10, i32 15> ret <16 x i16> %shuffle @@ -2996,12 +2754,12 @@ define <16 x i16> @shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_1 ; ; AVX512VL-LABEL: shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_13: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: 
vextracti128 $1, %ymm0, %xmm1 ; AVX512VL-NEXT: vmovdqu {{.*#+}} xmm2 = [6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11] ; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm3 ; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] ; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512VL-NEXT: vinserti32x4 $1, %xmm3, %ymm0, %ymm0 +; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 3, i32 7, i32 1, i32 0, i32 2, i32 7, i32 3, i32 13, i32 11, i32 15, i32 9, i32 8, i32 10, i32 15, i32 11, i32 13> ret <16 x i16> %shuffle @@ -3693,23 +3451,14 @@ define <16 x i16> @shuffle_v16i16_05_06_07_00_01_02_03_12_13_14_15_08_09_10_11_1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_05_06_07_00_01_02_03_12_13_14_15_08_09_10_11_12: -; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7] -; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9] -; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i16_05_06_07_00_01_02_03_12_13_14_15_08_09_10_11_12: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7] -; AVX512VL-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9] -; AVX512VL-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9] -; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v16i16_05_06_07_00_01_02_03_12_13_14_15_08_09_10_11_12: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7] +; AVX2OR512VL-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9] +; AVX2OR512VL-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12> ret <16 x i16> %shuffle } @@ -3809,23 +3558,14 @@ define <16 x i16> @shuffle_v16i16_03_04_05_06_07_00_01_10_11_12_13_14_15_08_09_1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_03_04_05_06_07_00_01_10_11_12_13_14_15_08_09_10: -; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7] -; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] -; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i16_03_04_05_06_07_00_01_10_11_12_13_14_15_08_09_10: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7] -; AVX512VL-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] -; AVX512VL-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] -; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; 
AVX2OR512VL-LABEL: shuffle_v16i16_03_04_05_06_07_00_01_10_11_12_13_14_15_08_09_10: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7] +; AVX2OR512VL-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] +; AVX2OR512VL-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10> ret <16 x i16> %shuffle } @@ -4000,17 +3740,11 @@ define <16 x i16> @shuffle_v16i16_u_u_u_u_u_u_u_u_0_16_1_17_2_18_3_19(<16 x i16> ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_0_16_1_17_2_18_3_19: -; AVX2: # BB#0: -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_0_16_1_17_2_18_3_19: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512VL-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_0_16_1_17_2_18_3_19: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19> ret <16 x i16> %shuffle } @@ -4023,19 +3757,12 @@ define <16 x i16> @shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3(<16 x i16> %a, ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3: -; AVX2: # BB#0: -; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; AVX512VL-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> ret <16 x i16> %shuffle } @@ -4049,17 +3776,11 @@ define <16 x i16> @shuffle_v16i16_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8(<16 x i16> %a, ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8: -; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpbroadcastw 
%xmm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i16_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm0 -; AVX512VL-NEXT: vpbroadcastw %xmm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v16i16_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2OR512VL-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8> ret <16 x i16> %shuffle } @@ -4091,19 +3812,12 @@ define <16 x i16> @shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u(<16 x i16> %a, ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u: -; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm0 -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ret <16 x i16> %shuffle } diff --git a/test/CodeGen/X86/vector-shuffle-256-v32.ll b/test/CodeGen/X86/vector-shuffle-256-v32.ll index d4ec55a85d8d..301e8079a5dc 100644 --- a/test/CodeGen/X86/vector-shuffle-256-v32.ll +++ b/test/CodeGen/X86/vector-shuffle-256-v32.ll @@ -741,17 +741,11 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16: -; AVX2: # BB#0: -; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1 -; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpxor %ymm1, %ymm1, %ymm1 -; AVX512VL-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; AVX2OR512VL-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> ret <32 x i8> %shuffle } @@ -1167,19 +1161,12 @@ define <32 x i8> @shuffle_v32i8_32_32_32_32_32_32_32_32_08_09_10_11_12_13_14_15_ ; 
AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v32i8_32_32_32_32_32_32_32_32_08_09_10_11_12_13_14_15_48_48_48_48_48_48_48_48_24_25_26_27_28_29_30_31: -; AVX2: # BB#0: -; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2 -; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v32i8_32_32_32_32_32_32_32_32_08_09_10_11_12_13_14_15_48_48_48_48_48_48_48_48_24_25_26_27_28_29_30_31: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpxor %ymm2, %ymm2, %ymm2 -; AVX512VL-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v32i8_32_32_32_32_32_32_32_32_08_09_10_11_12_13_14_15_48_48_48_48_48_48_48_48_24_25_26_27_28_29_30_31: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vpxor %ymm2, %ymm2, %ymm2 +; AVX2OR512VL-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ret <32 x i8> %shuffle } @@ -1706,17 +1693,11 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_ ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40: -; AVX2: # BB#0: -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24] -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24] -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40> ret <32 x i8> %shuffle } @@ -1787,19 +1768,12 @@ define <32 x i8> @shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_ ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47: -; AVX2: # BB#0: -; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX2-NEXT: vpunpcklbw {{.*#+}} 
xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512VL-NEXT: vinserti32x4 $1, %xmm2, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX2OR512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47> ret <32 x i8> %shuffle } @@ -2188,7 +2162,7 @@ define <32 x i8> @shuffle_v32i8_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_ ; AVX512VL-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; AVX512VL-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 +; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10> ret <32 x i8> %shuffle @@ -2203,17 +2177,11 @@ define <32 x i8> @shuffle_v32i8_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_ ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16: -; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm0 -; AVX512VL-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2OR512VL-NEXT: vpbroadcastb %xmm0, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = 
shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> ret <32 x i8> %shuffle } @@ -2277,7 +2245,7 @@ define <32 x i8> @shuffle_v32i8_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_ ; ; AVX512VL-LABEL: shuffle_v32i8_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm0 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] diff --git a/test/CodeGen/X86/vector-shuffle-256-v4.ll b/test/CodeGen/X86/vector-shuffle-256-v4.ll index 3ecfc29d0f01..7f978138719e 100644 --- a/test/CodeGen/X86/vector-shuffle-256-v4.ll +++ b/test/CodeGen/X86/vector-shuffle-256-v4.ll @@ -320,39 +320,19 @@ define <4 x double> @shuffle_v4f64_4163(<4 x double> %a, <4 x double> %b) { } define <4 x double> @shuffle_v4f64_0145(<4 x double> %a, <4 x double> %b) { -; AVX1-LABEL: shuffle_v4f64_0145: -; AVX1: # BB#0: -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: shuffle_v4f64_0145: -; AVX2: # BB#0: -; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v4f64_0145: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; ALL-LABEL: shuffle_v4f64_0145: +; ALL: # BB#0: +; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; ALL-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5> ret <4 x double> %shuffle } define <4 x double> @shuffle_v4f64_4501(<4 x double> %a, <4 x double> %b) { -; AVX1-LABEL: shuffle_v4f64_4501: -; AVX1: # BB#0: -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: shuffle_v4f64_4501: -; AVX2: # BB#0: -; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v4f64_4501: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vinsertf32x4 $1, %xmm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; ALL-LABEL: shuffle_v4f64_4501: +; ALL: # BB#0: +; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; ALL-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1> ret <4 x double> %shuffle } @@ -367,23 +347,11 @@ define <4 x double> @shuffle_v4f64_0167(<4 x double> %a, <4 x double> %b) { } define <4 x double> @shuffle_v4f64_1054(<4 x double> %a, <4 x double> %b) { -; AVX1-LABEL: shuffle_v4f64_1054: -; AVX1: # BB#0: -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] -; AVX1-NEXT: retq -; -; AVX2-LABEL: shuffle_v4f64_1054: -; AVX2: # BB#0: -; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v4f64_1054: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] -; AVX512VL-NEXT: retq +; ALL-LABEL: shuffle_v4f64_1054: +; ALL: # BB#0: +; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] +; ALL-NEXT: retq %shuffle = shufflevector 
<4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 0, i32 5, i32 4> ret <4 x double> %shuffle } @@ -735,7 +703,7 @@ define <4 x i64> @shuffle_v4i64_0142(<4 x i64> %a, <4 x i64> %b) { ; ; AVX512VL-LABEL: shuffle_v4i64_0142: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm1 +; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,2] ; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX512VL-NEXT: retq @@ -808,7 +776,7 @@ define <4 x i64> @shuffle_v4i64_0145(<4 x i64> %a, <4 x i64> %b) { ; ; AVX512VL-LABEL: shuffle_v4i64_0145: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5> ret <4 x i64> %shuffle @@ -852,7 +820,7 @@ define <4 x i64> @shuffle_v4i64_4501(<4 x i64> %a, <4 x i64> %b) { ; ; AVX512VL-LABEL: shuffle_v4i64_4501: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vinserti32x4 $1, %xmm0, %ymm1, %ymm0 +; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1> ret <4 x i64> %shuffle @@ -948,7 +916,7 @@ define <4 x i64> @shuffle_v4i64_1054(<4 x i64> %a, <4 x i64> %b) { ; ; AVX512VL-LABEL: shuffle_v4i64_1054: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] ; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 0, i32 5, i32 4> @@ -1424,7 +1392,7 @@ define <4 x i64> @concat_v4i64_0145_bc(<4 x i64> %a0, <4 x i64> %a1) { ; ; AVX512VL-LABEL: concat_v4i64_0145_bc: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq %a0lo = shufflevector <4 x i64> %a0, <4 x i64> %a1, <2 x i32> <i32 0, i32 1> %a1lo = shufflevector <4 x i64> %a0, <4 x i64> %a1, <2 x i32> <i32 4, i32 5> diff --git a/test/CodeGen/X86/vector-shuffle-256-v8.ll b/test/CodeGen/X86/vector-shuffle-256-v8.ll index d6e91ca25d75..cba15827d32c 100644 --- a/test/CodeGen/X86/vector-shuffle-256-v8.ll +++ b/test/CodeGen/X86/vector-shuffle-256-v8.ll @@ -753,17 +753,11 @@ define <8 x float> @shuffle_v8f32_76543210(<8 x float> %a, <8 x float> %b) { } define <8 x float> @shuffle_v8f32_3210ba98(<8 x float> %a, <8 x float> %b) { -; AVX1OR2-LABEL: shuffle_v8f32_3210ba98: -; AVX1OR2: # BB#0: -; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; AVX1OR2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8f32_3210ba98: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vinsertf64x2 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; AVX512VL-NEXT: retq +; ALL-LABEL: shuffle_v8f32_3210ba98: +; ALL: # BB#0: +; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; ALL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 11, i32 10, i32 9, i32 8> ret <8 x float> %shuffle } @@ -829,17 +823,11 @@ define <8 x float> @shuffle_v8f32_ba987654(<8 x float> %a, <8 x float> %b) { } define <8 x float> @shuffle_v8f32_ba983210(<8 x float> %a, <8 x float> %b) { -; AVX1OR2-LABEL: shuffle_v8f32_ba983210: -; 
AVX1OR2: # BB#0: -; AVX1OR2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; AVX1OR2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8f32_ba983210: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vinsertf64x2 $1, %xmm0, %ymm1, %ymm0 -; AVX512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; AVX512VL-NEXT: retq +; ALL-LABEL: shuffle_v8f32_ba983210: +; ALL: # BB#0: +; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; ALL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 11, i32 10, i32 9, i32 8, i32 3, i32 2, i32 1, i32 0> ret <8 x float> %shuffle } @@ -863,17 +851,11 @@ define <8 x float> @shuffle_v8f32_a2u3e6f7(<8 x float> %a, <8 x float> %b) { } define <8 x float> @shuffle_v8f32_uuuu1111(<8 x float> %a, <8 x float> %b) { -; AVX1OR2-LABEL: shuffle_v8f32_uuuu1111: -; AVX1OR2: # BB#0: -; AVX1OR2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX1OR2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1OR2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8f32_uuuu1111: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX512VL-NEXT: vinsertf32x4 $1, %xmm0, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; ALL-LABEL: shuffle_v8f32_uuuu1111: +; ALL: # BB#0: +; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; ALL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 1, i32 1, i32 1> ret <8 x float> %shuffle } @@ -885,17 +867,11 @@ define <8 x float> @shuffle_v8f32_44444444(<8 x float> %a, <8 x float> %b) { ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8f32_44444444: -; AVX2: # BB#0: -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vbroadcastss %xmm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8f32_44444444: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextractf32x4 $1, %ymm0, %xmm0 -; AVX512VL-NEXT: vbroadcastss %xmm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8f32_44444444: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX2OR512VL-NEXT: vbroadcastss %xmm0, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4> ret <8 x float> %shuffle } @@ -910,33 +886,21 @@ define <8 x float> @shuffle_v8f32_1188uuuu(<8 x float> %a, <8 x float> %b) { } define <8 x float> @shuffle_v8f32_uuuu3210(<8 x float> %a, <8 x float> %b) { -; AVX1OR2-LABEL: shuffle_v8f32_uuuu3210: -; AVX1OR2: # BB#0: -; AVX1OR2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] -; AVX1OR2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1OR2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8f32_uuuu3210: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] -; AVX512VL-NEXT: vinsertf32x4 $1, %xmm0, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; ALL-LABEL: shuffle_v8f32_uuuu3210: +; ALL: # BB#0: +; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; ALL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 3, i32 2, i32 1, i32 0> ret <8 x float> %shuffle } define <8 x float> @shuffle_v8f32_uuuu1188(<8 x float> %a, <8 x float> %b) { -; AVX1OR2-LABEL: shuffle_v8f32_uuuu1188: -; AVX1OR2: # BB#0: -; AVX1OR2-NEXT: 
vshufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,0] -; AVX1OR2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1OR2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8f32_uuuu1188: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,0] -; AVX512VL-NEXT: vinsertf32x4 $1, %xmm0, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; ALL-LABEL: shuffle_v8f32_uuuu1188: +; ALL: # BB#0: +; ALL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,0] +; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; ALL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 1, i32 8, i32 8> ret <8 x float> %shuffle } @@ -951,17 +915,11 @@ define <8 x float> @shuffle_v8f32_1111uuuu(<8 x float> %a, <8 x float> %b) { } define <8 x float> @shuffle_v8f32_5555uuuu(<8 x float> %a, <8 x float> %b) { -; AVX1OR2-LABEL: shuffle_v8f32_5555uuuu: -; AVX1OR2: # BB#0: -; AVX1OR2-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1OR2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX1OR2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8f32_5555uuuu: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextractf32x4 $1, %ymm0, %xmm0 -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX512VL-NEXT: retq +; ALL-LABEL: shuffle_v8f32_5555uuuu: +; ALL: # BB#0: +; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0 +; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; ALL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 undef, i32 undef, i32 undef, i32 undef> ret <8 x float> %shuffle } @@ -1041,17 +999,11 @@ define <8 x i32> @shuffle_v8i32_00040000(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8i32_00040000: -; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,4,0,0,0,0] -; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i32_00040000: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,4,0,0,0,0] -; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8i32_00040000: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,4,0,0,0,0] +; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0> ret <8 x i32> %shuffle } @@ -1064,17 +1016,11 @@ define <8 x i32> @shuffle_v8i32_00500000(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,1,0,4,4,4,4] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8i32_00500000: -; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,5,0,0,0,0,0] -; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i32_00500000: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,5,0,0,0,0,0] -; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8i32_00500000: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,5,0,0,0,0,0] +; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0> ret <8 x i32> %shuffle } @@ -1087,17 +1033,11 @@ define <8 x i32> @shuffle_v8i32_06000000(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,0,0,4,4,4,4] ; AVX1-NEXT: retq ; -; AVX2-LABEL: 
shuffle_v8i32_06000000: -; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,6,0,0,0,0,0,0] -; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i32_06000000: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,6,0,0,0,0,0,0] -; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8i32_06000000: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,6,0,0,0,0,0,0] +; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> ret <8 x i32> %shuffle } @@ -1142,17 +1082,11 @@ define <8 x i32> @shuffle_v8i32_00112233(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8i32_00112233: -; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3] -; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i32_00112233: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3] -; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8i32_00112233: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3] +; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3> ret <8 x i32> %shuffle } @@ -1556,17 +1490,11 @@ define <8 x i32> @shuffle_v8i32_00015444(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,1,5,4,4,4] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8i32_00015444: -; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,1,5,4,4,4] -; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i32_00015444: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,1,5,4,4,4] -; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8i32_00015444: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,1,5,4,4,4] +; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 5, i32 4, i32 4, i32 4> ret <8 x i32> %shuffle } @@ -1577,17 +1505,11 @@ define <8 x i32> @shuffle_v8i32_00204644(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,0,4,6,4,4] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8i32_00204644: -; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,2,0,4,6,4,4] -; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i32_00204644: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,2,0,4,6,4,4] -; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8i32_00204644: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,2,0,4,6,4,4] +; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 6, i32 4, i32 4> ret <8 x i32> %shuffle } @@ -1598,17 +1520,11 @@ define <8 x i32> @shuffle_v8i32_03004474(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,3,0,0,4,4,7,4] ; AVX1-NEXT: retq ; 
-; AVX2-LABEL: shuffle_v8i32_03004474: -; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,3,0,0,4,4,7,4] -; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i32_03004474: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,3,0,0,4,4,7,4] -; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8i32_03004474: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,3,0,0,4,4,7,4] +; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 4, i32 7, i32 4> ret <8 x i32> %shuffle } @@ -1619,17 +1535,11 @@ define <8 x i32> @shuffle_v8i32_10004444(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,0,0,4,4,4,4] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8i32_10004444: -; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,0,0,4,4,4,4] -; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i32_10004444: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,0,0,4,4,4,4] -; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8i32_10004444: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,0,0,4,4,4,4] +; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4> ret <8 x i32> %shuffle } @@ -1640,17 +1550,11 @@ define <8 x i32> @shuffle_v8i32_22006446(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,2,0,0,6,4,4,6] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8i32_22006446: -; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [2,2,0,0,6,4,4,6] -; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i32_22006446: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [2,2,0,0,6,4,4,6] -; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8i32_22006446: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [2,2,0,0,6,4,4,6] +; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 4, i32 4, i32 6> ret <8 x i32> %shuffle } @@ -1661,17 +1565,11 @@ define <8 x i32> @shuffle_v8i32_33307474(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,3,3,0,7,4,7,4] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8i32_33307474: -; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,3,0,7,4,7,4] -; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i32_33307474: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,3,0,7,4,7,4] -; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8i32_33307474: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,3,0,7,4,7,4] +; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 4, i32 7, i32 4> ret <8 x i32> %shuffle } @@ -1682,17 +1580,11 @@ define <8 x i32> @shuffle_v8i32_32104567(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = 
ymm0[3,2,1,0,4,5,6,7] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8i32_32104567: -; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [3,2,1,0,4,5,6,7] -; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i32_32104567: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [3,2,1,0,4,5,6,7] -; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8i32_32104567: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [3,2,1,0,4,5,6,7] +; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7> ret <8 x i32> %shuffle } @@ -1703,17 +1595,11 @@ define <8 x i32> @shuffle_v8i32_00236744(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,3,6,7,4,4] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8i32_00236744: -; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,2,3,6,7,4,4] -; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i32_00236744: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,2,3,6,7,4,4] -; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8i32_00236744: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,2,3,6,7,4,4] +; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 6, i32 7, i32 4, i32 4> ret <8 x i32> %shuffle } @@ -1724,17 +1610,11 @@ define <8 x i32> @shuffle_v8i32_00226644(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,2,6,6,4,4] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8i32_00226644: -; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,2,2,6,6,4,4] -; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i32_00226644: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,2,2,6,6,4,4] -; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8i32_00226644: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,2,2,6,6,4,4] +; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 6, i32 6, i32 4, i32 4> ret <8 x i32> %shuffle } @@ -1745,17 +1625,11 @@ define <8 x i32> @shuffle_v8i32_10324567(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,4,5,6,7] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8i32_10324567: -; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,3,2,4,5,6,7] -; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i32_10324567: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,3,2,4,5,6,7] -; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8i32_10324567: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,3,2,4,5,6,7] +; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7> ret <8 x i32> %shuffle } @@ -1766,17 +1640,11 @@ define <8 x i32> @shuffle_v8i32_11334567(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: 
vpermilps {{.*#+}} ymm0 = ymm0[1,1,3,3,4,5,6,7] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8i32_11334567: -; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,1,3,3,4,5,6,7] -; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i32_11334567: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [1,1,3,3,4,5,6,7] -; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8i32_11334567: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [1,1,3,3,4,5,6,7] +; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 4, i32 5, i32 6, i32 7> ret <8 x i32> %shuffle } @@ -1787,17 +1655,11 @@ define <8 x i32> @shuffle_v8i32_01235467(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,3,5,4,6,7] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8i32_01235467: -; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,5,4,6,7] -; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i32_01235467: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,5,4,6,7] -; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8i32_01235467: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,5,4,6,7] +; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7> ret <8 x i32> %shuffle } @@ -1808,17 +1670,11 @@ define <8 x i32> @shuffle_v8i32_01235466(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,3,5,4,6,6] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8i32_01235466: -; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,5,4,6,6] -; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i32_01235466: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,5,4,6,6] -; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8i32_01235466: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,5,4,6,6] +; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 6> ret <8 x i32> %shuffle } @@ -1829,17 +1685,11 @@ define <8 x i32> @shuffle_v8i32_002u6u44(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,u,6,u,4,4] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8i32_002u6u44: -; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = <0,0,2,u,6,u,4,4> -; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i32_002u6u44: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,0,2,u,6,u,4,4> -; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8i32_002u6u44: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,0,2,u,6,u,4,4> +; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 undef, i32 6, i32 undef, i32 4, i32 4> ret <8 x i32> %shuffle } @@ -1850,17 +1700,11 @@ define <8 x i32> @shuffle_v8i32_00uu66uu(<8 x i32> 
%a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,u,u,6,6,u,u] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8i32_00uu66uu: -; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = <0,0,u,u,6,6,u,u> -; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i32_00uu66uu: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,0,u,u,6,6,u,u> -; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8i32_00uu66uu: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,0,u,u,6,6,u,u> +; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 undef, i32 undef, i32 6, i32 6, i32 undef, i32 undef> ret <8 x i32> %shuffle } @@ -1871,17 +1715,11 @@ define <8 x i32> @shuffle_v8i32_103245uu(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,4,5,u,u] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8i32_103245uu: -; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = <1,0,3,2,4,5,u,u> -; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i32_103245uu: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = <1,0,3,2,4,5,u,u> -; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8i32_103245uu: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = <1,0,3,2,4,5,u,u> +; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 undef, i32 undef> ret <8 x i32> %shuffle } @@ -1892,17 +1730,11 @@ define <8 x i32> @shuffle_v8i32_1133uu67(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,1,3,3,u,u,6,7] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8i32_1133uu67: -; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = <1,1,3,3,u,u,6,7> -; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i32_1133uu67: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = <1,1,3,3,u,u,6,7> -; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8i32_1133uu67: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = <1,1,3,3,u,u,6,7> +; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 undef, i32 undef, i32 6, i32 7> ret <8 x i32> %shuffle } @@ -1913,17 +1745,11 @@ define <8 x i32> @shuffle_v8i32_0uu354uu(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,u,u,3,5,4,u,u] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8i32_0uu354uu: -; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = <0,u,u,3,5,4,u,u> -; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i32_0uu354uu: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,u,u,3,5,4,u,u> -; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8i32_0uu354uu: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,u,u,3,5,4,u,u> +; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 5, i32 4, i32 undef, i32 undef> ret <8 x i32> %shuffle } 
@@ -1934,17 +1760,11 @@ define <8 x i32> @shuffle_v8i32_uuu3uu66(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[u,u,u,3,u,u,6,6] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8i32_uuu3uu66: -; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = <u,u,u,3,u,u,6,6> -; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i32_uuu3uu66: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = <u,u,u,3,u,u,6,6> -; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8i32_uuu3uu66: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = <u,u,u,3,u,u,6,6> +; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 6> ret <8 x i32> %shuffle } @@ -2038,17 +1858,11 @@ define <8 x i32> @shuffle_v8i32_3210ba98(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8i32_3210ba98: -; AVX2: # BB#0: -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i32_3210ba98: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8i32_3210ba98: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 11, i32 10, i32 9, i32 8> ret <8 x i32> %shuffle } @@ -2185,17 +1999,11 @@ define <8 x i32> @shuffle_v8i32_uuuu1111(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8i32_uuuu1111: -; AVX2: # BB#0: -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i32_uuuu1111: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX512VL-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8i32_uuuu1111: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 1, i32 1, i32 1> ret <8 x i32> %shuffle } @@ -2233,7 +2041,7 @@ define <8 x i32> @shuffle_v8i32_44444444(<8 x i32> %a, <8 x i32> %b) { ; ; AVX512VL-LABEL: shuffle_v8i32_44444444: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm0 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512VL-NEXT: vpbroadcastd %xmm0, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4> @@ -2247,17 +2055,11 @@ define <8 x i32> @shuffle_v8i32_5555uuuu(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8i32_5555uuuu: -; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX2-NEXT: retq -; -; 
AVX512VL-LABEL: shuffle_v8i32_5555uuuu: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm0 -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v8i32_5555uuuu: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 undef, i32 undef, i32 undef, i32 undef> ret <8 x i32> %shuffle } diff --git a/test/CodeGen/X86/vector-shuffle-512-v16.ll b/test/CodeGen/X86/vector-shuffle-512-v16.ll index 7f7c27af47b3..b951bf1c97ed 100644 --- a/test/CodeGen/X86/vector-shuffle-512-v16.ll +++ b/test/CodeGen/X86/vector-shuffle-512-v16.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512F -; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512F +; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512bw,+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW target triple = "x86_64-unknown-unknown" @@ -35,7 +35,7 @@ define <16 x float> @shuffle_v16f32_00_10_01_11_04_14_05_15_08_18_09_19_0c_1c_0d define <16 x float> @shuffle_v16f32_00_zz_01_zz_04_zz_05_zz_08_zz_09_zz_0c_zz_0d_zz(<16 x float> %a, <16 x float> %b) { ; ALL-LABEL: shuffle_v16f32_00_zz_01_zz_04_zz_05_zz_08_zz_09_zz_0c_zz_0d_zz: ; ALL: # BB#0: -; ALL-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; ALL-NEXT: vxorps %zmm1, %zmm1, %zmm1 ; ALL-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] ; ALL-NEXT: retq %shuffle = shufflevector <16 x float> %a, <16 x float> zeroinitializer, <16 x i32><i32 0, i32 16, i32 1, i32 16, i32 4, i32 16, i32 5, i32 16, i32 8, i32 16, i32 9, i32 16, i32 12, i32 16, i32 13, i32 16> @@ -82,7 +82,7 @@ define <16 x float> @shuffle_v16f32_02_12_03_13_06_16_07_17_0a_1a_0b_1b_0e_1e_0f define <16 x float> @shuffle_v16f32_zz_12_zz_13_zz_16_zz_17_zz_1a_zz_1b_zz_1e_zz_1f(<16 x float> %a, <16 x float> %b) { ; ALL-LABEL: shuffle_v16f32_zz_12_zz_13_zz_16_zz_17_zz_1a_zz_1b_zz_1e_zz_1f: ; ALL: # BB#0: -; ALL-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; ALL-NEXT: vxorps %zmm0, %zmm0, %zmm0 ; ALL-NEXT: vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] ; ALL-NEXT: retq %shuffle = shufflevector <16 x float> zeroinitializer, <16 x float> %b, <16 x i32><i32 0, i32 18, i32 0, i32 19, i32 4, i32 22, i32 4, i32 23, i32 6, i32 26, i32 6, i32 27, i32 8, i32 30, i32 8, i32 31> @@ -262,8 +262,8 @@ define <16 x i32> @shuffle_v16i32_0_1_2_19_u_u_u_u_u_u_u_u_u_u_u_u(<16 x i32> %a define <8 x float> @shuffle_v16f32_extract_256(float* %RET, float* %a) { ; ALL-LABEL: shuffle_v16f32_extract_256: ; ALL: # BB#0: -; ALL-NEXT: vmovupd (%rsi), %zmm0 -; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vmovups (%rsi), %zmm0 +; ALL-NEXT: vextractf32x8 $1, %zmm0, %ymm0 ; ALL-NEXT: retq %ptr_a = bitcast float* %a to <16 x float>* %v_a = load <16 x float>, <16 x float>* %ptr_a, align 4 @@ -397,8 +397,8 @@ define 
<16 x i32> @mask_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15 ret <16 x i32> %res } -define <16 x i32> @mask_shuffle_v16i32_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passthru, i16 %mask) { -; ALL-LABEL: mask_shuffle_v16i32_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16: +define <16 x i32> @mask_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16_17(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passthru, i16 %mask) { +; ALL-LABEL: mask_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16_17: ; ALL: # BB#0: ; ALL-NEXT: kmovw %edi, %k1 ; ALL-NEXT: valignd {{.*#+}} zmm2 {%k1} = zmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm1[0,1] @@ -422,8 +422,8 @@ define <16 x i32> @maskz_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_1 ret <16 x i32> %res } -define <16 x i32> @maskz_shuffle_v16i32_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16(<16 x i32> %a, <16 x i32> %b, i16 %mask) { -; ALL-LABEL: maskz_shuffle_v16i32_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16: +define <16 x i32> @maskz_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16_17(<16 x i32> %a, <16 x i32> %b, i16 %mask) { +; ALL-LABEL: maskz_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16_17: ; ALL: # BB#0: ; ALL-NEXT: kmovw %edi, %k1 ; ALL-NEXT: valignd {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm1[0,1] @@ -495,3 +495,55 @@ define <16 x i32> @test_vshufi32x4_512_mask(<16 x i32> %x, <16 x i32> %x1, <16 x %res = select <16 x i1> %mask, <16 x i32> %x2, <16 x i32> %y ret <16 x i32> %res } + +define <16 x float> @mask_shuffle_v16f32_00_01_02_03_04_05_06_07_16_17_18_19_20_21_22_23(<16 x float> %a, <16 x float> %b, <16 x float> %passthru, i16 %mask) { +; ALL-LABEL: mask_shuffle_v16f32_00_01_02_03_04_05_06_07_16_17_18_19_20_21_22_23: +; ALL: # BB#0: +; ALL-NEXT: kmovw %edi, %k1 +; ALL-NEXT: vinsertf32x8 $1, %ymm1, %zmm0, %zmm2 {%k1} +; ALL-NEXT: vmovaps %zmm2, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23> + %mask.cast = bitcast i16 %mask to <16 x i1> + %res = select <16 x i1> %mask.cast, <16 x float> %shuffle, <16 x float> %passthru + ret <16 x float> %res +} + +define <16 x float> @mask_shuffle_v16f32_00_01_02_03_16_17_18_19_08_09_10_11_12_13_14_15(<16 x float> %a, <16 x float> %b, <16 x float> %passthru, i16 %mask) { +; ALL-LABEL: mask_shuffle_v16f32_00_01_02_03_16_17_18_19_08_09_10_11_12_13_14_15: +; ALL: # BB#0: +; ALL-NEXT: kmovw %edi, %k1 +; ALL-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm2 {%k1} +; ALL-NEXT: vmovaps %zmm2, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %mask.cast = bitcast i16 %mask to <16 x i1> + %res = select <16 x i1> %mask.cast, <16 x float> %shuffle, <16 x float> %passthru + ret <16 x float> %res +} + +define <16 x i32> @mask_shuffle_v16i32_00_01_02_03_04_05_06_07_16_17_18_19_20_21_22_23(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passthru, i16 %mask) { +; ALL-LABEL: mask_shuffle_v16i32_00_01_02_03_04_05_06_07_16_17_18_19_20_21_22_23: +; ALL: # BB#0: +; ALL-NEXT: kmovw %edi, %k1 +; ALL-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm2 {%k1} +; ALL-NEXT: vmovdqa64 %zmm2, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 
1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23> + %mask.cast = bitcast i16 %mask to <16 x i1> + %res = select <16 x i1> %mask.cast, <16 x i32> %shuffle, <16 x i32> %passthru + ret <16 x i32> %res +} + +define <16 x i32> @mask_shuffle_v16i32_00_01_02_03_16_17_18_19_08_09_10_11_12_13_14_15(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passthru, i16 %mask) { +; ALL-LABEL: mask_shuffle_v16i32_00_01_02_03_16_17_18_19_08_09_10_11_12_13_14_15: +; ALL: # BB#0: +; ALL-NEXT: kmovw %edi, %k1 +; ALL-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm2 {%k1} +; ALL-NEXT: vmovdqa64 %zmm2, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %mask.cast = bitcast i16 %mask to <16 x i1> + %res = select <16 x i1> %mask.cast, <16 x i32> %shuffle, <16 x i32> %passthru + ret <16 x i32> %res +} diff --git a/test/CodeGen/X86/vector-shuffle-512-v8.ll b/test/CodeGen/X86/vector-shuffle-512-v8.ll index 625681dc294c..365ff3bf63d5 100644 --- a/test/CodeGen/X86/vector-shuffle-512-v8.ll +++ b/test/CodeGen/X86/vector-shuffle-512-v8.ll @@ -2375,3 +2375,199 @@ define <8 x i64> @maskz_shuffle_v8i64_12345670(<8 x i64> %a, i8 %mask) { %res = select <8 x i1> %mask.cast, <8 x i64> %shuffle, <8 x i64> zeroinitializer ret <8 x i64> %res } + +define <8 x double> @shuffle_v8f64_012389AB(<8 x double> %a, <8 x double> %b) { +; AVX512F-LABEL: shuffle_v8f64_012389AB: +; AVX512F: # BB#0: +; AVX512F-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8f64_012389AB: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-32-NEXT: retl + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11> + ret <8 x double> %shuffle +} + +define <8 x double> @shuffle_v8f64_89AB0123(<8 x double> %a, <8 x double> %b) { +; AVX512F-LABEL: shuffle_v8f64_89AB0123: +; AVX512F: # BB#0: +; AVX512F-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8f64_89AB0123: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: retl + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 0, i32 1, i32 2, i32 3> + ret <8 x double> %shuffle +} + +define <8 x double> @shuffle_v8f64_01230123(<8 x double> %a, <8 x double> %b) { +; AVX512F-LABEL: shuffle_v8f64_01230123: +; AVX512F: # BB#0: +; AVX512F-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8f64_01230123: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512F-32-NEXT: retl + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3> + ret <8 x double> %shuffle +} + +define <8 x i64> @shuffle_v8i64_012389AB(<8 x i64> %a, <8 x i64> %b) { +; AVX512F-LABEL: shuffle_v8i64_012389AB: +; AVX512F: # BB#0: +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8i64_012389AB: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-32-NEXT: retl + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11> + ret <8 x i64> %shuffle +} + 
+define <8 x i64> @shuffle_v8i64_89AB0123(<8 x i64> %a, <8 x i64> %b) { +; AVX512F-LABEL: shuffle_v8i64_89AB0123: +; AVX512F: # BB#0: +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8i64_89AB0123: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: retl + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 0, i32 1, i32 2, i32 3> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_01230123(<8 x i64> %a, <8 x i64> %b) { +; AVX512F-LABEL: shuffle_v8i64_01230123: +; AVX512F: # BB#0: +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8i64_01230123: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512F-32-NEXT: retl + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3> + ret <8 x i64> %shuffle +} + +define <8 x double> @shuffle_v8f64_89234567(<8 x double> %a, <8 x double> %b) { +; AVX512F-LABEL: shuffle_v8f64_89234567: +; AVX512F: # BB#0: +; AVX512F-NEXT: vinsertf32x4 $0, %xmm1, %zmm0, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8f64_89234567: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: vinsertf32x4 $0, %xmm1, %zmm0, %zmm0 +; AVX512F-32-NEXT: retl + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x double> %shuffle +} + +define <8 x double> @shuffle_v8f64_01894567(<8 x double> %a, <8 x double> %b) { +; AVX512F-LABEL: shuffle_v8f64_01894567: +; AVX512F: # BB#0: +; AVX512F-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8f64_01894567: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0 +; AVX512F-32-NEXT: retl + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7> + ret <8 x double> %shuffle +} + +define <8 x double> @shuffle_v8f64_01238967(<8 x double> %a, <8 x double> %b) { +; AVX512F-LABEL: shuffle_v8f64_01238967: +; AVX512F: # BB#0: +; AVX512F-NEXT: vinsertf32x4 $2, %xmm1, %zmm0, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8f64_01238967: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: vinsertf32x4 $2, %xmm1, %zmm0, %zmm0 +; AVX512F-32-NEXT: retl + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7> + ret <8 x double> %shuffle +} + +define <8 x double> @shuffle_v8f64_01234589(<8 x double> %a, <8 x double> %b) { +; AVX512F-LABEL: shuffle_v8f64_01234589: +; AVX512F: # BB#0: +; AVX512F-NEXT: vinsertf32x4 $3, %xmm1, %zmm0, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8f64_01234589: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: vinsertf32x4 $3, %xmm1, %zmm0, %zmm0 +; AVX512F-32-NEXT: retl + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9> + ret <8 x double> %shuffle +} + +define <8 x i64> @shuffle_v8i64_89234567(<8 x i64> %a, <8 x i64> %b) { +; AVX512F-LABEL: shuffle_v8i64_89234567: +; AVX512F: # BB#0: +; AVX512F-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8i64_89234567: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 +; AVX512F-32-NEXT: retl + %shuffle = shufflevector 
<8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_01894567(<8 x i64> %a, <8 x i64> %b) { +; AVX512F-LABEL: shuffle_v8i64_01894567: +; AVX512F: # BB#0: +; AVX512F-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8i64_01894567: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 +; AVX512F-32-NEXT: retl + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_01238967(<8 x i64> %a, <8 x i64> %b) { +; AVX512F-LABEL: shuffle_v8i64_01238967: +; AVX512F: # BB#0: +; AVX512F-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8i64_01238967: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 +; AVX512F-32-NEXT: retl + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_01234589(<8 x i64> %a, <8 x i64> %b) { +; AVX512F-LABEL: shuffle_v8i64_01234589: +; AVX512F: # BB#0: +; AVX512F-NEXT: vinserti32x4 $3, %xmm1, %zmm0, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8i64_01234589: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: vinserti32x4 $3, %xmm1, %zmm0, %zmm0 +; AVX512F-32-NEXT: retl + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9> + ret <8 x i64> %shuffle +} diff --git a/test/CodeGen/X86/vector-trunc-math.ll b/test/CodeGen/X86/vector-trunc-math.ll index 1dcfd3223c86..f828ed0ba6e7 100644 --- a/test/CodeGen/X86/vector-trunc-math.ll +++ b/test/CodeGen/X86/vector-trunc-math.ll @@ -419,40 +419,31 @@ define <16 x i8> @trunc_add_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin define <4 x i32> @trunc_add_const_v4i64_v4i32(<4 x i64> %a0) nounwind { ; SSE-LABEL: trunc_add_const_v4i64_v4i32: ; SSE: # BB#0: -; SSE-NEXT: movl $1, %eax -; SSE-NEXT: movd %rax, %xmm2 -; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7] -; SSE-NEXT: paddq %xmm2, %xmm0 -; SSE-NEXT: paddq {{.*}}(%rip), %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: paddd {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_add_const_v4i64_v4i32: ; AVX1: # BB#0: -; AVX1-NEXT: movl $1, %eax -; AVX1-NEXT: vmovq %rax, %xmm1 -; AVX1-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] -; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,2],xmm0[0,2] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_add_const_v4i64_v4i32: ; AVX2: # BB#0: -; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX2-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: trunc_add_const_v4i64_v4i32: ; AVX512: # BB#0: -; AVX512-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0 +; 
AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> ; AVX512-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX512-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = add <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3> %2 = trunc <4 x i64> %1 to <4 x i32> @@ -462,52 +453,39 @@ define <4 x i32> @trunc_add_const_v4i64_v4i32(<4 x i64> %a0) nounwind { define <8 x i16> @trunc_add_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; SSE-LABEL: trunc_add_const_v8i64_v8i16: ; SSE: # BB#0: -; SSE-NEXT: movl $1, %eax -; SSE-NEXT: movd %rax, %xmm4 -; SSE-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7] -; SSE-NEXT: paddq %xmm0, %xmm4 -; SSE-NEXT: paddq {{.*}}(%rip), %xmm1 -; SSE-NEXT: paddq {{.*}}(%rip), %xmm2 -; SSE-NEXT: paddq {{.*}}(%rip), %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: paddw {{.*}}(%rip), %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_add_const_v8i64_v8i16: ; AVX1: # BB#0: -; AVX1-NEXT: movl $1, %eax -; AVX1-NEXT: vmovq %rax, %xmm2 -; AVX1-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7] -; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm1, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7] -; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7] -; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7] +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7] +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 ; 
AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_add_const_v8i64_v8i16: ; AVX2: # BB#0: -; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm1, %ymm1 -; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] @@ -515,14 +493,14 @@ define <8 x i16> @trunc_add_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX2-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: trunc_add_const_v8i64_v8i16: ; AVX512: # BB#0: -; AVX512-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = add <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7> %2 = trunc <8 x i64> %1 to <8 x i16> @@ -532,41 +510,38 @@ define <8 x i16> @trunc_add_const_v8i64_v8i16(<8 x i64> %a0) nounwind { define <8 x i16> @trunc_add_const_v8i32_v8i16(<8 x i32> %a0) nounwind { ; SSE-LABEL: trunc_add_const_v8i32_v8i16: ; SSE: # BB#0: -; SSE-NEXT: paddd {{.*}}(%rip), %xmm0 -; SSE-NEXT: paddd {{.*}}(%rip), %xmm1 ; SSE-NEXT: pslld $16, %xmm1 ; SSE-NEXT: psrad $16, %xmm1 ; SSE-NEXT: pslld $16, %xmm0 ; SSE-NEXT: psrad $16, %xmm0 ; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: paddw {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_add_const_v8i32_v8i16: ; AVX1: # BB#0: -; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_add_const_v8i32_v8i16: ; AVX2: # BB#0: -; AVX2-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX2-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: trunc_add_const_v8i32_v8i16: ; AVX512: # BB#0: -; AVX512-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX512-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = add <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> %2 = trunc <8 x i32> %1 to <8 x i16> @@ -576,17 +551,6 @@ define <8 x i16> @trunc_add_const_v8i32_v8i16(<8 x i32> %a0) nounwind { define <16 x i8> @trunc_add_const_v16i64_v16i8(<16 x i64> 
%a0) nounwind { ; SSE-LABEL: trunc_add_const_v16i64_v16i8: ; SSE: # BB#0: -; SSE-NEXT: movl $1, %eax -; SSE-NEXT: movd %rax, %xmm8 -; SSE-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7] -; SSE-NEXT: paddq %xmm8, %xmm0 -; SSE-NEXT: paddq {{.*}}(%rip), %xmm1 -; SSE-NEXT: paddq {{.*}}(%rip), %xmm2 -; SSE-NEXT: paddq {{.*}}(%rip), %xmm3 -; SSE-NEXT: paddq {{.*}}(%rip), %xmm4 -; SSE-NEXT: paddq {{.*}}(%rip), %xmm5 -; SSE-NEXT: paddq {{.*}}(%rip), %xmm6 -; SSE-NEXT: paddq {{.*}}(%rip), %xmm7 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] ; SSE-NEXT: pand %xmm8, %xmm7 ; SSE-NEXT: pand %xmm8, %xmm6 @@ -603,50 +567,37 @@ define <16 x i8> @trunc_add_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: packuswb %xmm2, %xmm0 ; SSE-NEXT: packuswb %xmm4, %xmm0 +; SSE-NEXT: paddb {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_add_const_v16i64_v16i8: ; AVX1: # BB#0: -; AVX1-NEXT: movl $1, %eax -; AVX1-NEXT: vmovq %rax, %xmm4 -; AVX1-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7] -; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm8 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm1, %xmm5 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm2, %xmm6 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm2, %xmm2 -; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm3, %xmm7 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm3, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] -; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpand %xmm4, %xmm7, %xmm7 -; AVX1-NEXT: vpackuswb %xmm3, %xmm7, %xmm3 -; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm4, %xmm6, %xmm6 -; AVX1-NEXT: vpackuswb %xmm2, %xmm6, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT: vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vandps %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpackuswb %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm3 -; AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm4, %xmm8, %xmm3 -; AVX1-NEXT: vpackuswb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vandps %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vandps %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_add_const_v16i64_v16i8: ; AVX2: # BB#0: -; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm1, %ymm1 -; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm3, %ymm3 -; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm2, %ymm2 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = 
ymm2[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] @@ -666,37 +617,35 @@ define <16 x i8> @trunc_add_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_add_const_v16i64_v16i8: ; AVX512F: # BB#0: -; AVX512F-NEXT: vpaddq {{.*}}(%rip), %zmm1, %zmm1 -; AVX512F-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0 ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_add_const_v16i64_v16i8: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vpaddq {{.*}}(%rip), %zmm1, %zmm1 -; AVX512BW-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_add_const_v16i64_v16i8: ; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: vpaddq {{.*}}(%rip), %zmm1, %zmm1 -; AVX512DQ-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 ; AVX512DQ-NEXT: retq %1 = add <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15> %2 = trunc <16 x i64> %1 to <16 x i8> @@ -706,10 +655,6 @@ define <16 x i8> @trunc_add_const_v16i64_v16i8(<16 x i64> %a0) nounwind { define <16 x i8> @trunc_add_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; SSE-LABEL: trunc_add_const_v16i32_v16i8: ; SSE: # BB#0: -; SSE-NEXT: paddd {{.*}}(%rip), %xmm0 -; SSE-NEXT: paddd {{.*}}(%rip), %xmm1 -; SSE-NEXT: paddd {{.*}}(%rip), %xmm2 -; SSE-NEXT: paddd {{.*}}(%rip), %xmm3 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] ; SSE-NEXT: pand %xmm4, %xmm3 ; SSE-NEXT: pand %xmm4, %xmm2 @@ -718,31 +663,27 @@ define <16 x i8> @trunc_add_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: paddb {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_add_const_v16i32_v16i8: ; AVX1: # BB#0: -; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm1, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vandps %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb 
%xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_add_const_v16i32_v16i8: ; AVX2: # BB#0: -; AVX2-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vpaddd {{.*}}(%rip), %ymm1, %ymm1 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128] ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] @@ -752,13 +693,14 @@ define <16 x i8> @trunc_add_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: trunc_add_const_v16i32_v16i8: ; AVX512: # BB#0: -; AVX512-NEXT: vpaddd {{.*}}(%rip), %zmm0, %zmm0 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = add <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> %2 = trunc <16 x i32> %1 to <16 x i8> @@ -768,56 +710,54 @@ define <16 x i8> @trunc_add_const_v16i32_v16i8(<16 x i32> %a0) nounwind { define <16 x i8> @trunc_add_const_v16i16_v16i8(<16 x i16> %a0) nounwind { ; SSE-LABEL: trunc_add_const_v16i16_v16i8: ; SSE: # BB#0: -; SSE-NEXT: paddw {{.*}}(%rip), %xmm0 -; SSE-NEXT: paddw {{.*}}(%rip), %xmm1 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] ; SSE-NEXT: pand %xmm2, %xmm1 ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: paddb {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_add_const_v16i16_v16i8: ; AVX1: # BB#0: -; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_add_const_v16i16_v16i8: ; AVX2: # BB#0: -; AVX2-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_add_const_v16i16_v16i8: ; AVX512F: # BB#0: -; AVX512F-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_add_const_v16i16_v16i8: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> ; AVX512BW-NEXT: vpmovwb 
%zmm0, %ymm0 -; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX512BW-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_add_const_v16i16_v16i8: ; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0 ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 ; AVX512DQ-NEXT: retq %1 = add <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15> %2 = trunc <16 x i16> %1 to <16 x i8> @@ -1676,69 +1616,39 @@ define <4 x i32> @trunc_mul_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { ; AVX1-LABEL: trunc_mul_v4i64_v4i32: ; AVX1: # BB#0: ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm4 -; AVX1-NEXT: vpmuludq %xmm2, %xmm4, %xmm4 -; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm5 -; AVX1-NEXT: vpmuludq %xmm5, %xmm3, %xmm5 -; AVX1-NEXT: vpaddq %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4 -; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm3 -; AVX1-NEXT: vpmuludq %xmm1, %xmm3, %xmm3 -; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4 -; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm4 -; AVX1-NEXT: vpaddq %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpsllq $32, %xmm3, %xmm3 -; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpaddq %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_mul_v4i64_v4i32: ; AVX2: # BB#0: -; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm2 -; AVX2-NEXT: vpmuludq %ymm1, %ymm2, %ymm2 -; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3 -; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm3 -; AVX2-NEXT: vpaddq %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2 -; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_mul_v4i64_v4i32: ; AVX512F: # BB#0: -; AVX512F-NEXT: vpsrlq $32, %ymm0, %ymm2 -; AVX512F-NEXT: vpmuludq %ymm1, %ymm2, %ymm2 -; AVX512F-NEXT: vpsrlq $32, %ymm1, %ymm3 -; AVX512F-NEXT: vpmuludq %ymm3, %ymm0, %ymm3 -; AVX512F-NEXT: vpaddq %ymm2, %ymm3, %ymm2 -; AVX512F-NEXT: vpsllq $32, %ymm2, %ymm2 -; AVX512F-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpaddq %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def> +; AVX512F-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX512F-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_mul_v4i64_v4i32: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vpsrlq $32, %ymm0, %ymm2 -; AVX512BW-NEXT: vpmuludq %ymm1, %ymm2, %ymm2 -; AVX512BW-NEXT: vpsrlq $32, %ymm1, %ymm3 -; AVX512BW-NEXT: vpmuludq %ymm3, %ymm0, %ymm3 -; AVX512BW-NEXT: vpaddq %ymm2, %ymm3, %ymm2 -; AVX512BW-NEXT: vpsllq $32, %ymm2, %ymm2 
-; AVX512BW-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpaddq %ymm2, %ymm0, %ymm0 +; AVX512BW-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def> +; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX512BW-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_mul_v4i64_v4i32: @@ -1757,46 +1667,17 @@ define <4 x i32> @trunc_mul_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { define <8 x i16> @trunc_mul_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { ; SSE-LABEL: trunc_mul_v8i64_v8i16: ; SSE: # BB#0: -; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: psrlq $32, %xmm8 -; SSE-NEXT: pmuludq %xmm4, %xmm8 -; SSE-NEXT: movdqa %xmm4, %xmm9 -; SSE-NEXT: psrlq $32, %xmm9 -; SSE-NEXT: pmuludq %xmm0, %xmm9 -; SSE-NEXT: paddq %xmm8, %xmm9 -; SSE-NEXT: psllq $32, %xmm9 -; SSE-NEXT: pmuludq %xmm4, %xmm0 -; SSE-NEXT: paddq %xmm9, %xmm0 -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: psrlq $32, %xmm8 -; SSE-NEXT: pmuludq %xmm5, %xmm8 -; SSE-NEXT: movdqa %xmm5, %xmm4 -; SSE-NEXT: psrlq $32, %xmm4 -; SSE-NEXT: pmuludq %xmm1, %xmm4 -; SSE-NEXT: paddq %xmm8, %xmm4 -; SSE-NEXT: psllq $32, %xmm4 -; SSE-NEXT: pmuludq %xmm5, %xmm1 -; SSE-NEXT: paddq %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: psrlq $32, %xmm4 -; SSE-NEXT: pmuludq %xmm6, %xmm4 -; SSE-NEXT: movdqa %xmm6, %xmm5 -; SSE-NEXT: psrlq $32, %xmm5 -; SSE-NEXT: pmuludq %xmm2, %xmm5 -; SSE-NEXT: paddq %xmm4, %xmm5 -; SSE-NEXT: psllq $32, %xmm5 -; SSE-NEXT: pmuludq %xmm6, %xmm2 -; SSE-NEXT: paddq %xmm5, %xmm2 -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: psrlq $32, %xmm4 -; SSE-NEXT: pmuludq %xmm7, %xmm4 -; SSE-NEXT: movdqa %xmm7, %xmm5 -; SSE-NEXT: psrlq $32, %xmm5 -; SSE-NEXT: pmuludq %xmm3, %xmm5 -; SSE-NEXT: paddq %xmm4, %xmm5 -; SSE-NEXT: psllq $32, %xmm5 -; SSE-NEXT: pmuludq %xmm7, %xmm3 -; SSE-NEXT: paddq %xmm5, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,0,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,0,2,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm4[0],xmm6[1] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] @@ -1808,111 +1689,68 @@ define <8 x i16> @trunc_mul_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, %xmm0 +; SSE-NEXT: pmullw %xmm6, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_mul_v8i64_v8i16: ; AVX1: # BB#0: -; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm4 -; AVX1-NEXT: vpmuludq %xmm2, %xmm4, %xmm4 -; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm5 -; AVX1-NEXT: vpmuludq %xmm5, %xmm0, %xmm5 -; AVX1-NEXT: vpaddq %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4 -; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm5 -; AVX1-NEXT: vpaddq %xmm4, %xmm5, %xmm4 
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm5 -; AVX1-NEXT: vpmuludq %xmm2, %xmm5, %xmm5 -; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm6 -; AVX1-NEXT: vpmuludq %xmm6, %xmm0, %xmm6 -; AVX1-NEXT: vpaddq %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vpsllq $32, %xmm5, %xmm5 -; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpaddq %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm2 -; AVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm5 -; AVX1-NEXT: vpmuludq %xmm5, %xmm1, %xmm5 -; AVX1-NEXT: vpaddq %xmm2, %xmm5, %xmm2 -; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm5 -; AVX1-NEXT: vpaddq %xmm2, %xmm5, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm5 -; AVX1-NEXT: vpmuludq %xmm3, %xmm5, %xmm5 -; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm6 -; AVX1-NEXT: vpmuludq %xmm6, %xmm1, %xmm6 -; AVX1-NEXT: vpaddq %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vpsllq $32, %xmm5, %xmm5 -; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpaddq %xmm5, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] -; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1,2,3],xmm4[4],xmm3[5,6,7] -; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3],xmm4[4],xmm5[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1,2,3],xmm3[4],xmm5[5,6,7] +; AVX1-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3],xmm4[4],xmm5[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm5[1,2,3],xmm2[4],xmm5[5,6,7] +; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1,2,3],xmm3[4],xmm5[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1,2,3],xmm1[4],xmm5[5,6,7] +; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1,2,3],xmm3[4],xmm5[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm5[1,2,3],xmm0[4],xmm5[5,6,7] +; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_mul_v8i64_v8i16: ; AVX2: # BB#0: -; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm4 -; AVX2-NEXT: vpmuludq %ymm3, %ymm4, %ymm4 -; AVX2-NEXT: vpsrlq $32, %ymm3, %ymm5 -; AVX2-NEXT: vpmuludq %ymm5, %ymm1, %ymm5 -; AVX2-NEXT: vpaddq %ymm4, %ymm5, %ymm4 -; AVX2-NEXT: vpsllq $32, %ymm4, %ymm4 -; AVX2-NEXT: vpmuludq %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpaddq %ymm4, %ymm1, %ymm1 -; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm3 -; AVX2-NEXT: vpmuludq %ymm2, %ymm3, %ymm3 -; AVX2-NEXT: vpsrlq $32, %ymm2, %ymm4 -; AVX2-NEXT: vpmuludq %ymm4, %ymm0, %ymm4 -; AVX2-NEXT: vpaddq %ymm3, %ymm4, %ymm3 -; AVX2-NEXT: vpsllq $32, %ymm3, %ymm3 -; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpaddq %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = 
ymm2[0,2,2,3,4,6,6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128] +; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX2-NEXT: vpmullw %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_mul_v8i64_v8i16: ; AVX512F: # BB#0: -; AVX512F-NEXT: vpsrlq $32, %zmm0, %zmm2 -; AVX512F-NEXT: vpmuludq %zmm1, %zmm2, %zmm2 -; AVX512F-NEXT: vpsrlq $32, %zmm1, %zmm3 -; AVX512F-NEXT: vpmuludq %zmm3, %zmm0, %zmm3 -; AVX512F-NEXT: vpaddq %zmm2, %zmm3, %zmm2 -; AVX512F-NEXT: vpsllq $32, %zmm2, %zmm2 -; AVX512F-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpaddq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovqw %zmm1, %xmm1 ; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512F-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_mul_v8i64_v8i16: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm2 -; AVX512BW-NEXT: vpmuludq %zmm1, %zmm2, %zmm2 -; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm3 -; AVX512BW-NEXT: vpmuludq %zmm3, %zmm0, %zmm3 -; AVX512BW-NEXT: vpaddq %zmm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vpsllq $32, %zmm2, %zmm2 -; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovqw %zmm1, %xmm1 ; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_mul_v8i64_v8i16: @@ -2186,104 +2024,60 @@ define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; ; AVX2-LABEL: trunc_mul_v16i64_v16i8: ; AVX2: # BB#0: -; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm8 -; AVX2-NEXT: vpmuludq %ymm5, %ymm8, %ymm8 -; AVX2-NEXT: vpsrlq $32, %ymm5, %ymm9 -; AVX2-NEXT: vpmuludq %ymm9, %ymm1, %ymm9 -; AVX2-NEXT: vpaddq %ymm8, %ymm9, %ymm8 -; AVX2-NEXT: vpsllq $32, %ymm8, %ymm8 -; AVX2-NEXT: vpmuludq %ymm5, %ymm1, %ymm1 -; AVX2-NEXT: vpaddq %ymm8, %ymm1, %ymm1 -; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm5 -; AVX2-NEXT: vpmuludq %ymm4, %ymm5, %ymm5 -; AVX2-NEXT: vpsrlq $32, %ymm4, %ymm8 -; AVX2-NEXT: vpmuludq %ymm8, %ymm0, %ymm8 -; AVX2-NEXT: vpaddq %ymm5, %ymm8, %ymm5 -; AVX2-NEXT: vpsllq $32, %ymm5, %ymm5 -; AVX2-NEXT: vpmuludq %ymm4, %ymm0, %ymm0 -; AVX2-NEXT: vpaddq %ymm5, %ymm0, %ymm0 -; AVX2-NEXT: vpsrlq $32, %ymm3, %ymm4 -; AVX2-NEXT: vpmuludq %ymm7, %ymm4, %ymm4 -; AVX2-NEXT: vpsrlq $32, %ymm7, %ymm5 -; AVX2-NEXT: vpmuludq %ymm5, %ymm3, %ymm5 -; AVX2-NEXT: vpaddq %ymm4, %ymm5, %ymm4 -; AVX2-NEXT: vpsllq $32, %ymm4, %ymm4 -; AVX2-NEXT: vpmuludq %ymm7, %ymm3, %ymm3 -; AVX2-NEXT: vpaddq %ymm4, %ymm3, %ymm3 -; AVX2-NEXT: vpsrlq $32, %ymm2, %ymm4 -; AVX2-NEXT: vpmuludq %ymm6, %ymm4, %ymm4 -; AVX2-NEXT: vpsrlq 
$32, %ymm6, %ymm5 -; AVX2-NEXT: vpmuludq %ymm5, %ymm2, %ymm5 -; AVX2-NEXT: vpaddq %ymm4, %ymm5, %ymm4 -; AVX2-NEXT: vpsllq $32, %ymm4, %ymm4 -; AVX2-NEXT: vpmuludq %ymm6, %ymm2, %ymm2 -; AVX2-NEXT: vpaddq %ymm4, %ymm2, %ymm2 -; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,2,2,3,4,6,6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3] ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX2-NEXT: vpmulld %xmm7, %xmm3, %xmm3 +; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,2,2,3,4,6,6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] +; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-NEXT: vpmulld %xmm6, %xmm2, %xmm2 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128] ; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,2,2,3,4,6,6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: vpmulld %xmm5, %xmm1, %xmm1 +; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vpmulld %xmm4, %xmm0, %xmm0 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm0 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_mul_v16i64_v16i8: ; AVX512F: # BB#0: -; AVX512F-NEXT: vpsrlq $32, %zmm1, %zmm4 -; AVX512F-NEXT: vpmuludq %zmm3, %zmm4, %zmm4 -; AVX512F-NEXT: vpsrlq $32, %zmm3, %zmm5 -; AVX512F-NEXT: vpmuludq %zmm5, %zmm1, %zmm5 -; AVX512F-NEXT: vpaddq %zmm4, %zmm5, %zmm4 -; AVX512F-NEXT: vpsllq $32, %zmm4, %zmm4 -; AVX512F-NEXT: vpmuludq %zmm3, %zmm1, %zmm1 -; AVX512F-NEXT: vpaddq %zmm4, %zmm1, %zmm1 -; AVX512F-NEXT: vpsrlq $32, %zmm0, %zmm3 -; AVX512F-NEXT: vpmuludq %zmm2, %zmm3, %zmm3 -; AVX512F-NEXT: vpsrlq $32, %zmm2, %zmm4 -; AVX512F-NEXT: vpmuludq %zmm4, %zmm0, %zmm4 -; AVX512F-NEXT: vpaddq %zmm3, %zmm4, %zmm3 -; AVX512F-NEXT: vpsllq $32, %zmm3, %zmm3 -; AVX512F-NEXT: vpmuludq %zmm2, %zmm0, %zmm0 -; AVX512F-NEXT: vpaddq %zmm3, %zmm0, %zmm0 -; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512F-NEXT: vpmovqd %zmm3, %ymm3 ; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512F-NEXT: vpmulld %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpmovqd %zmm2, %ymm2 +; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512F-NEXT: vpmulld %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_mul_v16i64_v16i8: ; AVX512BW: # BB#0: 
-; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm4 -; AVX512BW-NEXT: vpmuludq %zmm3, %zmm4, %zmm4 -; AVX512BW-NEXT: vpsrlq $32, %zmm3, %zmm5 -; AVX512BW-NEXT: vpmuludq %zmm5, %zmm1, %zmm5 -; AVX512BW-NEXT: vpaddq %zmm4, %zmm5, %zmm4 -; AVX512BW-NEXT: vpsllq $32, %zmm4, %zmm4 -; AVX512BW-NEXT: vpmuludq %zmm3, %zmm1, %zmm1 -; AVX512BW-NEXT: vpaddq %zmm4, %zmm1, %zmm1 -; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm3 -; AVX512BW-NEXT: vpmuludq %zmm2, %zmm3, %zmm3 -; AVX512BW-NEXT: vpsrlq $32, %zmm2, %zmm4 -; AVX512BW-NEXT: vpmuludq %zmm4, %zmm0, %zmm4 -; AVX512BW-NEXT: vpaddq %zmm3, %zmm4, %zmm3 -; AVX512BW-NEXT: vpsllq $32, %zmm3, %zmm3 -; AVX512BW-NEXT: vpmuludq %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpaddq %zmm3, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512BW-NEXT: vpmovqd %zmm3, %ymm3 ; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512BW-NEXT: vpmulld %ymm3, %ymm1, %ymm1 +; AVX512BW-NEXT: vpmovqd %zmm2, %ymm2 +; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512BW-NEXT: vpmulld %ymm2, %ymm0, %ymm0 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512BW-NEXT: retq @@ -2479,70 +2273,25 @@ define <4 x i32> @trunc_mul_const_v4i64_v4i32(<4 x i64> %a0) nounwind { ; AVX1-LABEL: trunc_mul_const_v4i64_v4i32: ; AVX1: # BB#0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3] -; AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1 -; AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1 -; AVX1-NEXT: vpaddq %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: movl $1, %eax -; AVX1-NEXT: vmovq %rax, %xmm2 -; AVX1-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7] -; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm3 -; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0 -; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX1-NEXT: vpaddq %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_mul_const_v4i64_v4i32: ; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3] -; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0 -; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpsllq $32, %ymm0, %ymm0 -; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512F-LABEL: trunc_mul_const_v4i64_v4i32: -; AVX512F: # BB#0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3] -; AVX512F-NEXT: vpmuludq %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vpsrlq $32, %ymm0, %ymm0 -; AVX512F-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpsllq $32, %ymm0, %ymm0 -; AVX512F-NEXT: vpaddq %ymm0, %ymm2, %ymm0 -; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: trunc_mul_const_v4i64_v4i32: -; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3] -; AVX512BW-NEXT: vpmuludq %ymm1, %ymm0, %ymm2 -; AVX512BW-NEXT: vpsrlq $32, %ymm0, %ymm0 -; AVX512BW-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpsllq $32, %ymm0, %ymm0 -; AVX512BW-NEXT: vpaddq %ymm0, %ymm2, %ymm0 -; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> 
%YMM0<kill> -; AVX512BW-NEXT: retq -; -; AVX512DQ-LABEL: trunc_mul_const_v4i64_v4i32: -; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3] -; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> -; AVX512DQ-NEXT: retq +; AVX512-LABEL: trunc_mul_const_v4i64_v4i32: +; AVX512: # BB#0: +; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: retq %1 = mul <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3> %2 = trunc <4 x i64> %1 to <4 x i32> ret <4 x i32> %2 @@ -2551,36 +2300,6 @@ define <4 x i32> @trunc_mul_const_v4i64_v4i32(<4 x i64> %a0) nounwind { define <8 x i16> @trunc_mul_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; SSE-LABEL: trunc_mul_const_v8i64_v8i16: ; SSE: # BB#0: -; SSE-NEXT: movl $1, %eax -; SSE-NEXT: movd %rax, %xmm4 -; SSE-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: pmuludq %xmm4, %xmm5 -; SSE-NEXT: psrlq $32, %xmm0 -; SSE-NEXT: pmuludq %xmm4, %xmm0 -; SSE-NEXT: psllq $32, %xmm0 -; SSE-NEXT: paddq %xmm5, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [2,3] -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: pmuludq %xmm4, %xmm5 -; SSE-NEXT: psrlq $32, %xmm1 -; SSE-NEXT: pmuludq %xmm4, %xmm1 -; SSE-NEXT: psllq $32, %xmm1 -; SSE-NEXT: paddq %xmm5, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [4,5] -; SSE-NEXT: movdqa %xmm2, %xmm5 -; SSE-NEXT: pmuludq %xmm4, %xmm5 -; SSE-NEXT: psrlq $32, %xmm2 -; SSE-NEXT: pmuludq %xmm4, %xmm2 -; SSE-NEXT: psllq $32, %xmm2 -; SSE-NEXT: paddq %xmm5, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [6,7] -; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: pmuludq %xmm4, %xmm5 -; SSE-NEXT: psrlq $32, %xmm3 -; SSE-NEXT: pmuludq %xmm4, %xmm3 -; SSE-NEXT: psllq $32, %xmm3 -; SSE-NEXT: paddq %xmm5, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] @@ -2592,64 +2311,28 @@ define <8 x i16> @trunc_mul_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, %xmm0 +; SSE-NEXT: pmullw {{.*}}(%rip), %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_mul_const_v8i64_v8i16: ; AVX1: # BB#0: -; AVX1-NEXT: movl $1, %eax -; AVX1-NEXT: vmovq %rax, %xmm2 -; AVX1-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7] -; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm3 -; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm4 -; AVX1-NEXT: vpmuludq %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3] -; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm4 -; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0 -; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5] -; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm4 -; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm5 -; AVX1-NEXT: vpmuludq %xmm3, %xmm5, %xmm3 -; AVX1-NEXT: vpsllq $32, %xmm3, %xmm3 -; AVX1-NEXT: vpaddq %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vextractf128 
$1, %ymm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [6,7] -; AVX1-NEXT: vpmuludq %xmm4, %xmm1, %xmm5 -; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1 -; AVX1-NEXT: vpmuludq %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1 -; AVX1-NEXT: vpaddq %xmm1, %xmm5, %xmm1 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7] -; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7] -; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7] +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7] +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_mul_const_v8i64_v8i16: ; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,6,7] -; AVX2-NEXT: vpmuludq %ymm2, %ymm1, %ymm3 -; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm1 -; AVX2-NEXT: vpmuludq %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpsllq $32, %ymm1, %ymm1 -; AVX2-NEXT: vpaddq %ymm1, %ymm3, %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3] -; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm3 -; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0 -; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpsllq $32, %ymm0, %ymm0 -; AVX2-NEXT: vpaddq %ymm0, %ymm3, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] @@ -2657,37 +2340,15 @@ define <8 x i16> @trunc_mul_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512F-LABEL: trunc_mul_const_v8i64_v8i16: -; AVX512F: # BB#0: -; AVX512F-NEXT: vmovdqa32 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7] -; AVX512F-NEXT: vpmuludq %zmm1, %zmm0, %zmm2 -; AVX512F-NEXT: vpsrlq $32, %zmm0, %zmm0 -; AVX512F-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpsllq $32, %zmm0, %zmm0 -; AVX512F-NEXT: vpaddq %zmm0, %zmm2, %zmm0 -; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: trunc_mul_const_v8i64_v8i16: -; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovdqa32 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7] -; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpsllq $32, %zmm0, %zmm0 -; AVX512BW-NEXT: vpaddq %zmm0, %zmm2, %zmm0 -; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0 -; AVX512BW-NEXT: retq -; -; AVX512DQ-LABEL: trunc_mul_const_v8i64_v8i16: -; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: vpmullq 
{{.*}}(%rip), %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovqw %zmm0, %xmm0 -; AVX512DQ-NEXT: retq +; AVX512-LABEL: trunc_mul_const_v8i64_v8i16: +; AVX512: # BB#0: +; AVX512-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: retq %1 = mul <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7> %2 = trunc <8 x i64> %1 to <8 x i16> ret <8 x i16> %2 @@ -2696,55 +2357,38 @@ define <8 x i16> @trunc_mul_const_v8i64_v8i16(<8 x i64> %a0) nounwind { define <8 x i16> @trunc_mul_const_v8i32_v8i16(<8 x i32> %a0) nounwind { ; SSE-LABEL: trunc_mul_const_v8i32_v8i16: ; SSE: # BB#0: -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,1,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; SSE-NEXT: pmuludq %xmm2, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE-NEXT: pmuludq %xmm3, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; SSE-NEXT: pmuludq %xmm2, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE-NEXT: pmuludq %xmm3, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: pslld $16, %xmm1 ; SSE-NEXT: psrad $16, %xmm1 ; SSE-NEXT: pslld $16, %xmm0 ; SSE-NEXT: psrad $16, %xmm0 ; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_mul_const_v8i32_v8i16: ; AVX1: # BB#0: -; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_mul_const_v8i32_v8i16: ; AVX2: # BB#0: -; AVX2-NEXT: vpmulld {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: trunc_mul_const_v8i32_v8i16: ; AVX512: # BB#0: -; AVX512-NEXT: vpmulld {{.*}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX512-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = mul <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> %2 = trunc <8 x i32> %1 to <8 x i16> @@ -2907,34 +2551,12 @@ define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; ; AVX2-LABEL: trunc_mul_const_v16i64_v16i8: ; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [4,5,6,7] -; AVX2-NEXT: vpmuludq %ymm4, %ymm1, %ymm5 -; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm1 -; AVX2-NEXT: vpmuludq %ymm4, %ymm1, %ymm1 -; AVX2-NEXT: vpsllq $32, 
%ymm1, %ymm1 -; AVX2-NEXT: vpaddq %ymm1, %ymm5, %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3] -; AVX2-NEXT: vpmuludq %ymm4, %ymm0, %ymm5 -; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0 -; AVX2-NEXT: vpmuludq %ymm4, %ymm0, %ymm0 -; AVX2-NEXT: vpsllq $32, %ymm0, %ymm0 -; AVX2-NEXT: vpaddq %ymm0, %ymm5, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [12,13,14,15] -; AVX2-NEXT: vpmuludq %ymm4, %ymm3, %ymm5 -; AVX2-NEXT: vpsrlq $32, %ymm3, %ymm3 -; AVX2-NEXT: vpmuludq %ymm4, %ymm3, %ymm3 -; AVX2-NEXT: vpsllq $32, %ymm3, %ymm3 -; AVX2-NEXT: vpaddq %ymm3, %ymm5, %ymm3 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [8,9,10,11] -; AVX2-NEXT: vpmuludq %ymm4, %ymm2, %ymm5 -; AVX2-NEXT: vpsrlq $32, %ymm2, %ymm2 -; AVX2-NEXT: vpmuludq %ymm4, %ymm2, %ymm2 -; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2 -; AVX2-NEXT: vpaddq %ymm2, %ymm5, %ymm2 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm2, %xmm2 ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm3, %xmm3 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128] ; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2 @@ -2943,8 +2565,10 @@ define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] @@ -2955,50 +2579,30 @@ define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; ; AVX512F-LABEL: trunc_mul_const_v16i64_v16i8: ; AVX512F: # BB#0: -; AVX512F-NEXT: vmovdqa32 {{.*#+}} zmm2 = [8,9,10,11,12,13,14,15] -; AVX512F-NEXT: vpmuludq %zmm2, %zmm1, %zmm3 -; AVX512F-NEXT: vpsrlq $32, %zmm1, %zmm1 -; AVX512F-NEXT: vpmuludq %zmm2, %zmm1, %zmm1 -; AVX512F-NEXT: vpsllq $32, %zmm1, %zmm1 -; AVX512F-NEXT: vpaddq %zmm1, %zmm3, %zmm1 -; AVX512F-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7] -; AVX512F-NEXT: vpmuludq %zmm2, %zmm0, %zmm3 -; AVX512F-NEXT: vpsrlq $32, %zmm0, %zmm0 -; AVX512F-NEXT: vpmuludq %zmm2, %zmm0, %zmm0 -; AVX512F-NEXT: vpsllq $32, %zmm0, %zmm0 -; AVX512F-NEXT: vpaddq %zmm0, %zmm3, %zmm0 ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512F-NEXT: vpmulld {{.*}}(%rip), %ymm0, %ymm0 ; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512F-NEXT: vpmulld {{.*}}(%rip), %ymm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_mul_const_v16i64_v16i8: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovdqa32 {{.*#+}} zmm2 = [8,9,10,11,12,13,14,15] -; AVX512BW-NEXT: vpmuludq %zmm2, %zmm1, %zmm3 -; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm1 -; AVX512BW-NEXT: vpmuludq %zmm2, %zmm1, %zmm1 -; AVX512BW-NEXT: vpsllq $32, %zmm1, %zmm1 -; AVX512BW-NEXT: vpaddq %zmm1, %zmm3, %zmm1 -; AVX512BW-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7] -; AVX512BW-NEXT: vpmuludq %zmm2, %zmm0, %zmm3 -; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmuludq %zmm2, %zmm0, %zmm0 -; 
AVX512BW-NEXT: vpsllq $32, %zmm0, %zmm0 -; AVX512BW-NEXT: vpaddq %zmm0, %zmm3, %zmm0 ; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512BW-NEXT: vpmulld {{.*}}(%rip), %ymm0, %ymm0 ; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512BW-NEXT: vpmulld {{.*}}(%rip), %ymm1, %ymm1 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_mul_const_v16i64_v16i8: ; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: vpmullq {{.*}}(%rip), %zmm1, %zmm1 -; AVX512DQ-NEXT: vpmullq {{.*}}(%rip), %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512DQ-NEXT: vpmulld {{.*}}(%rip), %ymm0, %ymm0 ; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512DQ-NEXT: vpmulld {{.*}}(%rip), %ymm1, %ymm1 ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512DQ-NEXT: retq @@ -3073,15 +2677,15 @@ define <16 x i8> @trunc_mul_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; ; AVX2-LABEL: trunc_mul_const_v16i32_v16i8: ; AVX2: # BB#0: -; AVX2-NEXT: vpmulld {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vpmulld {{.*}}(%rip), %ymm1, %ymm1 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128] ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> ; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX2-NEXT: vzeroupper @@ -3547,36 +3151,31 @@ define <16 x i8> @trunc_and_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin define <4 x i32> @trunc_and_const_v4i64_v4i32(<4 x i64> %a0) nounwind { ; SSE-LABEL: trunc_and_const_v4i64_v4i32: ; SSE: # BB#0: -; SSE-NEXT: movl $1, %eax -; SSE-NEXT: movd %rax, %xmm2 -; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7] -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: andps {{.*}}(%rip), %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: andps {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_and_const_v4i64_v4i32: ; AVX1: # BB#0: -; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX1-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_and_const_v4i64_v4i32: ; AVX2: # BB#0: -; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: trunc_and_const_v4i64_v4i32: ; AVX512: # BB#0: -; AVX512-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> ; AVX512-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = and <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3> %2 = trunc <4 x i64> %1 to <4 x i32> @@ -3586,30 +3185,23 @@ define <4 x i32> @trunc_and_const_v4i64_v4i32(<4 x i64> %a0) nounwind { define <8 x i16> 
@trunc_and_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; SSE-LABEL: trunc_and_const_v8i64_v8i16: ; SSE: # BB#0: -; SSE-NEXT: movl $1, %eax -; SSE-NEXT: movd %rax, %xmm4 -; SSE-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7] -; SSE-NEXT: pand %xmm0, %xmm4 -; SSE-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE-NEXT: pand {{.*}}(%rip), %xmm2 -; SSE-NEXT: pand {{.*}}(%rip), %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: andpd {{.*}}(%rip), %xmm2 +; SSE-NEXT: movapd %xmm2, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_and_const_v8i64_v8i16: ; AVX1: # BB#0: -; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] @@ -3620,13 +3212,12 @@ define <8 x i16> @trunc_and_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7] ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_and_const_v8i64_v8i16: ; AVX2: # BB#0: -; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 -; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] @@ -3634,14 +3225,14 @@ define <8 x i16> @trunc_and_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: trunc_and_const_v8i64_v8i16: ; AVX512: # BB#0: -; AVX512-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = and <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7> %2 = trunc <8 x i64> %1 to <8 x i16> @@ -3651,40 +3242,38 @@ define <8 x i16> 
@trunc_and_const_v8i64_v8i16(<8 x i64> %a0) nounwind { define <8 x i16> @trunc_and_const_v8i32_v8i16(<8 x i32> %a0) nounwind { ; SSE-LABEL: trunc_and_const_v8i32_v8i16: ; SSE: # BB#0: -; SSE-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE-NEXT: pslld $16, %xmm1 ; SSE-NEXT: psrad $16, %xmm1 ; SSE-NEXT: pslld $16, %xmm0 ; SSE-NEXT: psrad $16, %xmm0 ; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_and_const_v8i32_v8i16: ; AVX1: # BB#0: -; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_and_const_v8i32_v8i16: ; AVX2: # BB#0: -; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: trunc_and_const_v8i32_v8i16: ; AVX512: # BB#0: -; AVX512-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = and <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> %2 = trunc <8 x i32> %1 to <8 x i16> @@ -3694,41 +3283,27 @@ define <8 x i16> @trunc_and_const_v8i32_v8i16(<8 x i32> %a0) nounwind { define <16 x i8> @trunc_and_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; SSE-LABEL: trunc_and_const_v16i64_v16i8: ; SSE: # BB#0: -; SSE-NEXT: movl $1, %eax -; SSE-NEXT: movd %rax, %xmm8 -; SSE-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7] -; SSE-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE-NEXT: pand {{.*}}(%rip), %xmm2 -; SSE-NEXT: pand {{.*}}(%rip), %xmm3 -; SSE-NEXT: pand {{.*}}(%rip), %xmm4 -; SSE-NEXT: pand {{.*}}(%rip), %xmm5 -; SSE-NEXT: pand {{.*}}(%rip), %xmm6 -; SSE-NEXT: pand {{.*}}(%rip), %xmm7 -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] -; SSE-NEXT: pand %xmm9, %xmm7 -; SSE-NEXT: pand %xmm9, %xmm6 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; SSE-NEXT: pand %xmm8, %xmm7 +; SSE-NEXT: pand %xmm8, %xmm6 ; SSE-NEXT: packuswb %xmm7, %xmm6 -; SSE-NEXT: pand %xmm9, %xmm5 -; SSE-NEXT: pand %xmm9, %xmm4 +; SSE-NEXT: pand %xmm8, %xmm5 +; SSE-NEXT: pand %xmm8, %xmm4 ; SSE-NEXT: packuswb %xmm5, %xmm4 ; SSE-NEXT: packuswb %xmm6, %xmm4 -; SSE-NEXT: pand %xmm9, %xmm3 -; SSE-NEXT: pand %xmm9, %xmm2 +; SSE-NEXT: pand %xmm8, %xmm3 +; SSE-NEXT: pand %xmm8, %xmm2 ; SSE-NEXT: packuswb %xmm3, %xmm2 -; SSE-NEXT: pand %xmm9, %xmm1 -; SSE-NEXT: pand %xmm9, %xmm8 +; SSE-NEXT: pand %xmm8, %xmm1 ; SSE-NEXT: pand %xmm8, %xmm0 ; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: packuswb %xmm2, %xmm0 ; SSE-NEXT: packuswb %xmm4, %xmm0 +; SSE-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_and_const_v16i64_v16i8: ; AVX1: # BB#0: -; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 -; 
AVX1-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1 -; AVX1-NEXT: vandps {{.*}}(%rip), %ymm2, %ymm2 -; AVX1-NEXT: vandps {{.*}}(%rip), %ymm3, %ymm3 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 ; AVX1-NEXT: vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] ; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4 @@ -3749,15 +3324,12 @@ define <16 x i8> @trunc_and_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_and_const_v16i64_v16i8: ; AVX2: # BB#0: -; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 -; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3 -; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] @@ -3777,37 +3349,35 @@ define <16 x i8> @trunc_and_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_and_const_v16i64_v16i8: ; AVX512F: # BB#0: -; AVX512F-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 -; AVX512F-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_and_const_v16i64_v16i8: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 -; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_and_const_v16i64_v16i8: ; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 -; AVX512DQ-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX512DQ-NEXT: retq %1 = and <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15> %2 = trunc <16 x i64> %1 to <16 x i8> @@ -3817,10 +3387,6 @@ define <16 x i8> @trunc_and_const_v16i64_v16i8(<16 x i64> %a0) nounwind { define <16 x i8> @trunc_and_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; SSE-LABEL: trunc_and_const_v16i32_v16i8: ; SSE: # BB#0: -; SSE-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE-NEXT: pand {{.*}}(%rip), %xmm2 -; SSE-NEXT: pand {{.*}}(%rip), %xmm3 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] ; SSE-NEXT: pand %xmm4, %xmm3 ; SSE-NEXT: pand %xmm4, %xmm2 @@ -3829,12 +3395,11 @@ define <16 x i8> @trunc_and_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq 
; ; AVX1-LABEL: trunc_and_const_v16i32_v16i8: ; AVX1: # BB#0: -; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] ; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2 @@ -3845,13 +3410,12 @@ define <16 x i8> @trunc_and_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_and_const_v16i32_v16i8: ; AVX2: # BB#0: -; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128] ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] @@ -3861,13 +3425,14 @@ define <16 x i8> @trunc_and_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: trunc_and_const_v16i32_v16i8: ; AVX512: # BB#0: -; AVX512-NEXT: vpandd {{.*}}(%rip), %zmm0, %zmm0 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = and <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> %2 = trunc <16 x i32> %1 to <16 x i8> @@ -3877,55 +3442,54 @@ define <16 x i8> @trunc_and_const_v16i32_v16i8(<16 x i32> %a0) nounwind { define <16 x i8> @trunc_and_const_v16i16_v16i8(<16 x i16> %a0) nounwind { ; SSE-LABEL: trunc_and_const_v16i16_v16i8: ; SSE: # BB#0: -; SSE-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] ; SSE-NEXT: pand %xmm2, %xmm1 ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_and_const_v16i16_v16i8: ; AVX1: # BB#0: -; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_and_const_v16i16_v16i8: ; AVX2: # BB#0: -; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_and_const_v16i16_v16i8: ; AVX512F: # BB#0: -; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_and_const_v16i16_v16i8: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vpand 
{{.*}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_and_const_v16i16_v16i8: ; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX512DQ-NEXT: retq %1 = and <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15> %2 = trunc <16 x i16> %1 to <16 x i8> @@ -4323,36 +3887,31 @@ define <16 x i8> @trunc_xor_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin define <4 x i32> @trunc_xor_const_v4i64_v4i32(<4 x i64> %a0) nounwind { ; SSE-LABEL: trunc_xor_const_v4i64_v4i32: ; SSE: # BB#0: -; SSE-NEXT: movl $1, %eax -; SSE-NEXT: movd %rax, %xmm2 -; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7] -; SSE-NEXT: pxor %xmm2, %xmm0 -; SSE-NEXT: xorps {{.*}}(%rip), %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: xorps {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_xor_const_v4i64_v4i32: ; AVX1: # BB#0: -; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX1-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_xor_const_v4i64_v4i32: ; AVX2: # BB#0: -; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: trunc_xor_const_v4i64_v4i32: ; AVX512: # BB#0: -; AVX512-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> ; AVX512-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = xor <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3> %2 = trunc <4 x i64> %1 to <4 x i32> @@ -4362,30 +3921,23 @@ define <4 x i32> @trunc_xor_const_v4i64_v4i32(<4 x i64> %a0) nounwind { define <8 x i16> @trunc_xor_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; SSE-LABEL: trunc_xor_const_v8i64_v8i16: ; SSE: # BB#0: -; SSE-NEXT: movl $1, %eax -; SSE-NEXT: movd %rax, %xmm4 -; SSE-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7] -; SSE-NEXT: pxor %xmm0, %xmm4 -; SSE-NEXT: pxor {{.*}}(%rip), %xmm1 -; SSE-NEXT: pxor {{.*}}(%rip), %xmm2 -; SSE-NEXT: pxor {{.*}}(%rip), %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = 
xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: xorpd {{.*}}(%rip), %xmm2 +; SSE-NEXT: movapd %xmm2, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_xor_const_v8i64_v8i16: ; AVX1: # BB#0: -; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] @@ -4396,13 +3948,12 @@ define <8 x i16> @trunc_xor_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7] ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_xor_const_v8i64_v8i16: ; AVX2: # BB#0: -; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm1, %ymm1 -; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] @@ -4410,14 +3961,14 @@ define <8 x i16> @trunc_xor_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: trunc_xor_const_v8i64_v8i16: ; AVX512: # BB#0: -; AVX512-NEXT: vpxorq {{.*}}(%rip), %zmm0, %zmm0 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = xor <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7> %2 = trunc <8 x i64> %1 to <8 x i16> @@ -4427,40 +3978,38 @@ define <8 x i16> @trunc_xor_const_v8i64_v8i16(<8 x i64> %a0) nounwind { define <8 x i16> @trunc_xor_const_v8i32_v8i16(<8 x i32> %a0) nounwind { ; SSE-LABEL: trunc_xor_const_v8i32_v8i16: ; SSE: # BB#0: -; SSE-NEXT: pxor {{.*}}(%rip), %xmm0 -; SSE-NEXT: pxor {{.*}}(%rip), %xmm1 ; SSE-NEXT: pslld $16, %xmm1 ; SSE-NEXT: psrad $16, %xmm1 ; SSE-NEXT: pslld $16, %xmm0 ; SSE-NEXT: psrad $16, %xmm0 ; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: pxor {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_xor_const_v8i32_v8i16: ; AVX1: # BB#0: -; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_xor_const_v8i32_v8i16: ; AVX2: # BB#0: -; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: trunc_xor_const_v8i32_v8i16: ; AVX512: # BB#0: -; AVX512-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = xor <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> %2 = trunc <8 x i32> %1 to <8 x i16> @@ -4470,17 +4019,6 @@ define <8 x i16> @trunc_xor_const_v8i32_v8i16(<8 x i32> %a0) nounwind { define <16 x i8> @trunc_xor_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; SSE-LABEL: trunc_xor_const_v16i64_v16i8: ; SSE: # BB#0: -; SSE-NEXT: movl $1, %eax -; SSE-NEXT: movd %rax, %xmm8 -; SSE-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7] -; SSE-NEXT: pxor %xmm8, %xmm0 -; SSE-NEXT: pxor {{.*}}(%rip), %xmm1 -; SSE-NEXT: pxor {{.*}}(%rip), %xmm2 -; SSE-NEXT: pxor {{.*}}(%rip), %xmm3 -; SSE-NEXT: pxor {{.*}}(%rip), %xmm4 -; SSE-NEXT: pxor {{.*}}(%rip), %xmm5 -; SSE-NEXT: pxor {{.*}}(%rip), %xmm6 -; SSE-NEXT: pxor {{.*}}(%rip), %xmm7 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] ; SSE-NEXT: pand %xmm8, %xmm7 ; SSE-NEXT: pand %xmm8, %xmm6 @@ -4497,14 +4035,11 @@ define <16 x i8> @trunc_xor_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: packuswb %xmm2, %xmm0 ; SSE-NEXT: packuswb %xmm4, %xmm0 +; SSE-NEXT: pxor {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_xor_const_v16i64_v16i8: ; AVX1: # BB#0: -; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm1, %ymm1 -; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm2, %ymm2 -; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm3, %ymm3 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 ; AVX1-NEXT: vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] ; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4 @@ -4525,15 +4060,12 @@ define <16 x i8> @trunc_xor_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_xor_const_v16i64_v16i8: ; AVX2: # BB#0: -; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm1, %ymm1 -; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm3, %ymm3 -; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm2, %ymm2 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] @@ -4553,37 +4085,35 @@ define <16 x i8> @trunc_xor_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_xor_const_v16i64_v16i8: ; AVX512F: # BB#0: -; AVX512F-NEXT: vpxorq {{.*}}(%rip), %zmm1, %zmm1 -; AVX512F-NEXT: vpxorq {{.*}}(%rip), %zmm0, %zmm0 ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512F-NEXT: 
vpmovqd %zmm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_xor_const_v16i64_v16i8: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vpxorq {{.*}}(%rip), %zmm1, %zmm1 -; AVX512BW-NEXT: vpxorq {{.*}}(%rip), %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_xor_const_v16i64_v16i8: ; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: vpxorq {{.*}}(%rip), %zmm1, %zmm1 -; AVX512DQ-NEXT: vpxorq {{.*}}(%rip), %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 ; AVX512DQ-NEXT: retq %1 = xor <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15> %2 = trunc <16 x i64> %1 to <16 x i8> @@ -4593,10 +4123,6 @@ define <16 x i8> @trunc_xor_const_v16i64_v16i8(<16 x i64> %a0) nounwind { define <16 x i8> @trunc_xor_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; SSE-LABEL: trunc_xor_const_v16i32_v16i8: ; SSE: # BB#0: -; SSE-NEXT: pxor {{.*}}(%rip), %xmm0 -; SSE-NEXT: pxor {{.*}}(%rip), %xmm1 -; SSE-NEXT: pxor {{.*}}(%rip), %xmm2 -; SSE-NEXT: pxor {{.*}}(%rip), %xmm3 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] ; SSE-NEXT: pand %xmm4, %xmm3 ; SSE-NEXT: pand %xmm4, %xmm2 @@ -4605,12 +4131,11 @@ define <16 x i8> @trunc_xor_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: pxor {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_xor_const_v16i32_v16i8: ; AVX1: # BB#0: -; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] ; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2 @@ -4621,13 +4146,12 @@ define <16 x i8> @trunc_xor_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_xor_const_v16i32_v16i8: ; AVX2: # BB#0: -; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm1, %ymm1 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128] ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] @@ -4637,13 +4161,14 @@ define <16 x i8> @trunc_xor_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: trunc_xor_const_v16i32_v16i8: ; AVX512: # BB#0: -; AVX512-NEXT: vpxord {{.*}}(%rip), %zmm0, %zmm0 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = xor <16 x 
i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> %2 = trunc <16 x i32> %1 to <16 x i8> @@ -4653,55 +4178,54 @@ define <16 x i8> @trunc_xor_const_v16i32_v16i8(<16 x i32> %a0) nounwind { define <16 x i8> @trunc_xor_const_v16i16_v16i8(<16 x i16> %a0) nounwind { ; SSE-LABEL: trunc_xor_const_v16i16_v16i8: ; SSE: # BB#0: -; SSE-NEXT: pxor {{.*}}(%rip), %xmm0 -; SSE-NEXT: pxor {{.*}}(%rip), %xmm1 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] ; SSE-NEXT: pand %xmm2, %xmm1 ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: pxor {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_xor_const_v16i16_v16i8: ; AVX1: # BB#0: -; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_xor_const_v16i16_v16i8: ; AVX2: # BB#0: -; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_xor_const_v16i16_v16i8: ; AVX512F: # BB#0: -; AVX512F-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_xor_const_v16i16_v16i8: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX512BW-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_xor_const_v16i16_v16i8: ; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0 ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 ; AVX512DQ-NEXT: retq %1 = xor <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15> %2 = trunc <16 x i16> %1 to <16 x i8> @@ -5099,36 +4623,31 @@ define <16 x i8> @trunc_or_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind define <4 x i32> @trunc_or_const_v4i64_v4i32(<4 x i64> %a0) nounwind { ; SSE-LABEL: trunc_or_const_v4i64_v4i32: ; SSE: # BB#0: -; SSE-NEXT: movl $1, %eax -; SSE-NEXT: movd %rax, %xmm2 -; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7] -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: orps {{.*}}(%rip), %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: orps {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_or_const_v4i64_v4i32: ; AVX1: # BB#0: -; AVX1-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX1-NEXT: vorps {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: 
trunc_or_const_v4i64_v4i32: ; AVX2: # BB#0: -; AVX2-NEXT: vpor {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: trunc_or_const_v4i64_v4i32: ; AVX512: # BB#0: -; AVX512-NEXT: vpor {{.*}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> ; AVX512-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = or <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3> %2 = trunc <4 x i64> %1 to <4 x i32> @@ -5138,30 +4657,23 @@ define <4 x i32> @trunc_or_const_v4i64_v4i32(<4 x i64> %a0) nounwind { define <8 x i16> @trunc_or_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; SSE-LABEL: trunc_or_const_v8i64_v8i16: ; SSE: # BB#0: -; SSE-NEXT: movl $1, %eax -; SSE-NEXT: movd %rax, %xmm4 -; SSE-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7] -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: por {{.*}}(%rip), %xmm1 -; SSE-NEXT: por {{.*}}(%rip), %xmm2 -; SSE-NEXT: por {{.*}}(%rip), %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: orpd {{.*}}(%rip), %xmm2 +; SSE-NEXT: movapd %xmm2, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_or_const_v8i64_v8i16: ; AVX1: # BB#0: -; AVX1-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: vorps {{.*}}(%rip), %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] @@ -5172,13 +4684,12 @@ define <8 x i16> @trunc_or_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7] ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_or_const_v8i64_v8i16: ; AVX2: # BB#0: -; AVX2-NEXT: vpor {{.*}}(%rip), %ymm1, %ymm1 -; AVX2-NEXT: vpor {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] @@ -5186,14 +4697,14 @@ define <8 x i16> 
@trunc_or_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: trunc_or_const_v8i64_v8i16: ; AVX512: # BB#0: -; AVX512-NEXT: vporq {{.*}}(%rip), %zmm0, %zmm0 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = or <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7> %2 = trunc <8 x i64> %1 to <8 x i16> @@ -5203,40 +4714,38 @@ define <8 x i16> @trunc_or_const_v8i64_v8i16(<8 x i64> %a0) nounwind { define <8 x i16> @trunc_or_const_v8i32_v8i16(<8 x i32> %a0) nounwind { ; SSE-LABEL: trunc_or_const_v8i32_v8i16: ; SSE: # BB#0: -; SSE-NEXT: por {{.*}}(%rip), %xmm0 -; SSE-NEXT: por {{.*}}(%rip), %xmm1 ; SSE-NEXT: pslld $16, %xmm1 ; SSE-NEXT: psrad $16, %xmm1 ; SSE-NEXT: pslld $16, %xmm0 ; SSE-NEXT: psrad $16, %xmm0 ; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: por {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_or_const_v8i32_v8i16: ; AVX1: # BB#0: -; AVX1-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_or_const_v8i32_v8i16: ; AVX2: # BB#0: -; AVX2-NEXT: vpor {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: trunc_or_const_v8i32_v8i16: ; AVX512: # BB#0: -; AVX512-NEXT: vpor {{.*}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = or <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> %2 = trunc <8 x i32> %1 to <8 x i16> @@ -5246,17 +4755,6 @@ define <8 x i16> @trunc_or_const_v8i32_v8i16(<8 x i32> %a0) nounwind { define <16 x i8> @trunc_or_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; SSE-LABEL: trunc_or_const_v16i64_v16i8: ; SSE: # BB#0: -; SSE-NEXT: movl $1, %eax -; SSE-NEXT: movd %rax, %xmm8 -; SSE-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7] -; SSE-NEXT: por %xmm8, %xmm0 -; SSE-NEXT: por {{.*}}(%rip), %xmm1 -; SSE-NEXT: por {{.*}}(%rip), %xmm2 -; SSE-NEXT: por {{.*}}(%rip), %xmm3 -; SSE-NEXT: por {{.*}}(%rip), %xmm4 -; SSE-NEXT: por {{.*}}(%rip), %xmm5 -; SSE-NEXT: por {{.*}}(%rip), %xmm6 -; SSE-NEXT: por {{.*}}(%rip), %xmm7 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] ; SSE-NEXT: pand %xmm8, %xmm7 ; SSE-NEXT: pand %xmm8, %xmm6 @@ -5273,14 +4771,11 @@ define <16 x i8> 
@trunc_or_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: packuswb %xmm2, %xmm0 ; SSE-NEXT: packuswb %xmm4, %xmm0 +; SSE-NEXT: por {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_or_const_v16i64_v16i8: ; AVX1: # BB#0: -; AVX1-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: vorps {{.*}}(%rip), %ymm1, %ymm1 -; AVX1-NEXT: vorps {{.*}}(%rip), %ymm2, %ymm2 -; AVX1-NEXT: vorps {{.*}}(%rip), %ymm3, %ymm3 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 ; AVX1-NEXT: vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] ; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4 @@ -5301,15 +4796,12 @@ define <16 x i8> @trunc_or_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_or_const_v16i64_v16i8: ; AVX2: # BB#0: -; AVX2-NEXT: vpor {{.*}}(%rip), %ymm1, %ymm1 -; AVX2-NEXT: vpor {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vpor {{.*}}(%rip), %ymm3, %ymm3 -; AVX2-NEXT: vpor {{.*}}(%rip), %ymm2, %ymm2 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] @@ -5329,37 +4821,35 @@ define <16 x i8> @trunc_or_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_or_const_v16i64_v16i8: ; AVX512F: # BB#0: -; AVX512F-NEXT: vporq {{.*}}(%rip), %zmm1, %zmm1 -; AVX512F-NEXT: vporq {{.*}}(%rip), %zmm0, %zmm0 ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_or_const_v16i64_v16i8: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vporq {{.*}}(%rip), %zmm1, %zmm1 -; AVX512BW-NEXT: vporq {{.*}}(%rip), %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_or_const_v16i64_v16i8: ; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: vporq {{.*}}(%rip), %zmm1, %zmm1 -; AVX512DQ-NEXT: vporq {{.*}}(%rip), %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX512DQ-NEXT: retq %1 = or <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15> %2 = trunc <16 x i64> %1 to <16 x i8> @@ -5369,10 +4859,6 @@ define <16 x i8> @trunc_or_const_v16i64_v16i8(<16 x i64> %a0) nounwind { define <16 x i8> @trunc_or_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; SSE-LABEL: trunc_or_const_v16i32_v16i8: ; SSE: # BB#0: -; SSE-NEXT: por {{.*}}(%rip), %xmm0 -; SSE-NEXT: por {{.*}}(%rip), %xmm1 -; SSE-NEXT: por {{.*}}(%rip), %xmm2 -; SSE-NEXT: por {{.*}}(%rip), %xmm3 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] ; SSE-NEXT: pand 
%xmm4, %xmm3 ; SSE-NEXT: pand %xmm4, %xmm2 @@ -5381,12 +4867,11 @@ define <16 x i8> @trunc_or_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: por {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_or_const_v16i32_v16i8: ; AVX1: # BB#0: -; AVX1-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: vorps {{.*}}(%rip), %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] ; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2 @@ -5397,13 +4882,12 @@ define <16 x i8> @trunc_or_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_or_const_v16i32_v16i8: ; AVX2: # BB#0: -; AVX2-NEXT: vpor {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vpor {{.*}}(%rip), %ymm1, %ymm1 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128] ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] @@ -5413,13 +4897,14 @@ define <16 x i8> @trunc_or_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: trunc_or_const_v16i32_v16i8: ; AVX512: # BB#0: -; AVX512-NEXT: vpord {{.*}}(%rip), %zmm0, %zmm0 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = or <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> %2 = trunc <16 x i32> %1 to <16 x i8> @@ -5429,55 +4914,54 @@ define <16 x i8> @trunc_or_const_v16i32_v16i8(<16 x i32> %a0) nounwind { define <16 x i8> @trunc_or_const_v16i16_v16i8(<16 x i16> %a0) nounwind { ; SSE-LABEL: trunc_or_const_v16i16_v16i8: ; SSE: # BB#0: -; SSE-NEXT: por {{.*}}(%rip), %xmm0 -; SSE-NEXT: por {{.*}}(%rip), %xmm1 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] ; SSE-NEXT: pand %xmm2, %xmm1 ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: por {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_or_const_v16i16_v16i8: ; AVX1: # BB#0: -; AVX1-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_or_const_v16i16_v16i8: ; AVX2: # BB#0: -; AVX2-NEXT: vpor {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_or_const_v16i16_v16i8: ; AVX512F: # BB#0: -; AVX512F-NEXT: vpor 
{{.*}}(%rip), %ymm0, %ymm0 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_or_const_v16i16_v16i8: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vpor {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX512BW-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_or_const_v16i16_v16i8: ; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: vpor {{.*}}(%rip), %ymm0, %ymm0 ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX512DQ-NEXT: retq %1 = or <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15> %2 = trunc <16 x i16> %1 to <16 x i8> @@ -5488,49 +4972,204 @@ define <16 x i8> @trunc_or_const_v16i16_v16i8(<16 x i16> %a0) nounwind { ; complex patterns - often created by vectorizer ; -define <4 x i32> @mul_add_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind { -; SSE-LABEL: mul_add_v4i64_v4i32: +define <4 x i32> @mul_add_const_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind { +; SSE-LABEL: mul_add_const_v4i64_v4i32: ; SSE: # BB#0: ; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: psrad $31, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: psrad $31, %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: psrad $31, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,1,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,1,1,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3] ; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: psrad $31, %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; SSE-NEXT: movdqa %xmm3, %xmm4 ; SSE-NEXT: psrlq $32, %xmm4 -; SSE-NEXT: pmuludq %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: pmuludq %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm1, %xmm5 ; SSE-NEXT: psrlq $32, %xmm5 -; SSE-NEXT: pmuludq %xmm3, %xmm5 +; SSE-NEXT: pmuludq %xmm2, %xmm5 ; SSE-NEXT: paddq %xmm4, %xmm5 ; SSE-NEXT: psllq $32, %xmm5 -; SSE-NEXT: pmuludq %xmm3, %xmm2 +; SSE-NEXT: pmuludq %xmm1, %xmm2 ; SSE-NEXT: paddq %xmm5, %xmm2 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: psrlq $32, %xmm3 -; SSE-NEXT: pmuludq %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: psrlq $32, %xmm1 +; SSE-NEXT: pmuludq %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm4 ; SSE-NEXT: psrlq $32, %xmm4 ; SSE-NEXT: pmuludq %xmm0, %xmm4 -; SSE-NEXT: paddq %xmm3, %xmm4 +; SSE-NEXT: paddq %xmm1, %xmm4 ; SSE-NEXT: psllq $32, %xmm4 -; SSE-NEXT: pmuludq %xmm1, %xmm0 +; SSE-NEXT: pmuludq %xmm3, %xmm0 ; SSE-NEXT: paddq %xmm4, %xmm0 -; SSE-NEXT: paddq {{.*}}(%rip), %xmm0 -; SSE-NEXT: paddq {{.*}}(%rip), %xmm2 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; SSE-NEXT: paddd {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: mul_add_const_v4i64_v4i32: +; AVX1: # BB#0: +; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; 
AVX2-LABEL: mul_add_const_v4i64_v4i32: +; AVX2: # BB#0: +; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: mul_add_const_v4i64_v4i32: +; AVX512F: # BB#0: +; AVX512F-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: mul_add_const_v4i64_v4i32: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: mul_add_const_v4i64_v4i32: +; AVX512DQ: # BB#0: +; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512DQ-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; AVX512DQ-NEXT: retq + %1 = sext <4 x i32> %a0 to <4 x i64> + %2 = sext <4 x i32> %a1 to <4 x i64> + %3 = mul <4 x i64> %1, %2 + %4 = add <4 x i64> %3, <i64 -3, i64 -1, i64 1, i64 3> + %5 = trunc <4 x i64> %4 to <4 x i32> + ret <4 x i32> %5 +} + +define <4 x i32> @mul_add_self_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind { +; SSE-LABEL: mul_add_self_v4i64_v4i32: +; SSE: # BB#0: +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: psrad $31, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: psrad $31, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: psrad $31, %xmm4 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: psrad $31, %xmm4 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: psrlq $32, %xmm4 +; SSE-NEXT: pmuludq %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm1, %xmm5 +; SSE-NEXT: psrlq $32, %xmm5 +; SSE-NEXT: pmuludq %xmm0, %xmm5 +; SSE-NEXT: paddq %xmm4, %xmm5 +; SSE-NEXT: psllq $32, %xmm5 +; SSE-NEXT: pmuludq %xmm0, %xmm1 +; SSE-NEXT: paddq %xmm5, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: psrlq $32, %xmm0 +; SSE-NEXT: pmuludq %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: psrlq $32, %xmm4 +; SSE-NEXT: pmuludq %xmm2, %xmm4 +; SSE-NEXT: paddq %xmm0, %xmm4 +; SSE-NEXT: psllq $32, %xmm4 +; SSE-NEXT: pmuludq %xmm2, %xmm3 +; SSE-NEXT: paddq %xmm4, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2] +; SSE-NEXT: paddd %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: mul_add_self_v4i64_v4i32: +; AVX1: # BB#0: +; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: mul_add_self_v4i64_v4i32: +; AVX2: # BB#0: +; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: mul_add_self_v4i64_v4i32: +; AVX512F: # BB#0: +; AVX512F-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: mul_add_self_v4i64_v4i32: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: mul_add_self_v4i64_v4i32: +; AVX512DQ: # BB#0: +; AVX512DQ-NEXT: vpmovsxdq %xmm0, %ymm0 +; 
AVX512DQ-NEXT: vpmovsxdq %xmm1, %ymm1 +; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512DQ-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; AVX512DQ-NEXT: retq + %1 = sext <4 x i32> %a0 to <4 x i64> + %2 = sext <4 x i32> %a1 to <4 x i64> + %3 = mul <4 x i64> %1, %2 + %4 = add <4 x i64> %3, %3 + %5 = trunc <4 x i64> %4 to <4 x i32> + ret <4 x i32> %5 +} + +define <4 x i32> @mul_add_multiuse_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind { +; SSE-LABEL: mul_add_multiuse_v4i64_v4i32: +; SSE: # BB#0: +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: psrad $31, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: psrad $31, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: psrad $31, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: psrad $31, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: psrlq $32, %xmm2 +; SSE-NEXT: pmuludq %xmm4, %xmm2 +; SSE-NEXT: movdqa %xmm3, %xmm5 +; SSE-NEXT: pmuludq %xmm4, %xmm5 +; SSE-NEXT: psrlq $32, %xmm4 +; SSE-NEXT: pmuludq %xmm3, %xmm4 +; SSE-NEXT: paddq %xmm2, %xmm4 +; SSE-NEXT: psllq $32, %xmm4 +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: psrlq $32, %xmm6 +; SSE-NEXT: pmuludq %xmm1, %xmm6 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pmuludq %xmm1, %xmm2 +; SSE-NEXT: psrlq $32, %xmm1 +; SSE-NEXT: pmuludq %xmm0, %xmm1 +; SSE-NEXT: paddq %xmm6, %xmm1 +; SSE-NEXT: psllq $32, %xmm1 +; SSE-NEXT: paddq %xmm0, %xmm1 +; SSE-NEXT: paddq %xmm1, %xmm2 +; SSE-NEXT: paddq %xmm3, %xmm4 +; SSE-NEXT: paddq %xmm5, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2] +; SSE-NEXT: movaps %xmm2, %xmm0 ; SSE-NEXT: retq ; -; AVX1-LABEL: mul_add_v4i64_v4i32: +; AVX1-LABEL: mul_add_multiuse_v4i64_v4i32: ; AVX1: # BB#0: ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm2 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] @@ -5538,58 +5177,58 @@ define <4 x i32> @mul_add_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind { ; AVX1-NEXT: vpmovsxdq %xmm1, %xmm3 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] ; AVX1-NEXT: vpmovsxdq %xmm1, %xmm1 -; AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmuldq %xmm3, %xmm2, %xmm1 -; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,2],xmm0[0,2] +; AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpmuldq %xmm3, %xmm2, %xmm3 +; AVX1-NEXT: vpaddq %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm2[0,2],xmm0[0,2] ; AVX1-NEXT: retq ; -; AVX2-LABEL: mul_add_v4i64_v4i32: +; AVX2-LABEL: mul_add_multiuse_v4i64_v4i32: ; AVX2: # BB#0: ; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0 ; AVX2-NEXT: vpmovsxdq %xmm1, %ymm1 -; AVX2-NEXT: vpmuldq %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpmuldq %ymm1, %ymm0, %ymm1 +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512F-LABEL: mul_add_v4i64_v4i32: +; AVX512F-LABEL: mul_add_multiuse_v4i64_v4i32: ; AVX512F: # BB#0: ; AVX512F-NEXT: 
vpmovsxdq %xmm0, %ymm0 ; AVX512F-NEXT: vpmovsxdq %xmm1, %ymm1 -; AVX512F-NEXT: vpmuldq %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0 +; AVX512F-NEXT: vpmuldq %ymm1, %ymm0, %ymm1 +; AVX512F-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> ; AVX512F-NEXT: retq ; -; AVX512BW-LABEL: mul_add_v4i64_v4i32: +; AVX512BW-LABEL: mul_add_multiuse_v4i64_v4i32: ; AVX512BW: # BB#0: ; AVX512BW-NEXT: vpmovsxdq %xmm0, %ymm0 ; AVX512BW-NEXT: vpmovsxdq %xmm1, %ymm1 -; AVX512BW-NEXT: vpmuldq %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: vpmuldq %ymm1, %ymm0, %ymm1 +; AVX512BW-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> ; AVX512BW-NEXT: retq ; -; AVX512DQ-LABEL: mul_add_v4i64_v4i32: +; AVX512DQ-LABEL: mul_add_multiuse_v4i64_v4i32: ; AVX512DQ: # BB#0: ; AVX512DQ-NEXT: vpmovsxdq %xmm0, %ymm0 ; AVX512DQ-NEXT: vpmovsxdq %xmm1, %ymm1 -; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0 +; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm1 +; AVX512DQ-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> ; AVX512DQ-NEXT: retq %1 = sext <4 x i32> %a0 to <4 x i64> %2 = sext <4 x i32> %a1 to <4 x i64> %3 = mul <4 x i64> %1, %2 - %4 = add <4 x i64> %3, <i64 -3, i64 -1, i64 1, i64 3> + %4 = add <4 x i64> %1, %3 %5 = trunc <4 x i64> %4 to <4 x i32> ret <4 x i32> %5 } diff --git a/test/CodeGen/X86/vector-trunc.ll b/test/CodeGen/X86/vector-trunc.ll index 8a826e025a33..2571a21ce218 100644 --- a/test/CodeGen/X86/vector-trunc.ll +++ b/test/CodeGen/X86/vector-trunc.ll @@ -643,7 +643,7 @@ define void @trunc32i16_32i8(<32 x i16> %a) { ; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512VL-NEXT: vpmovsxwd %ymm1, %zmm1 ; AVX512VL-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: vmovdqu %ymm0, (%rax) ; AVX512VL-NEXT: retq ; @@ -701,7 +701,7 @@ define <8 x i32> @trunc2x4i64_8i32(<4 x i64> %a, <4 x i64> %b) { ; AVX512VL: # BB#0: # %entry ; AVX512VL-NEXT: vpmovqd %ymm0, %xmm0 ; AVX512VL-NEXT: vpmovqd %ymm1, %xmm1 -; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: trunc2x4i64_8i32: @@ -717,7 +717,7 @@ define <8 x i32> @trunc2x4i64_8i32(<4 x i64> %a, <4 x i64> %b) { ; AVX512BWVL: # BB#0: # %entry ; AVX512BWVL-NEXT: vpmovqd %ymm0, %xmm0 ; AVX512BWVL-NEXT: vpmovqd %ymm1, %xmm1 -; AVX512BWVL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512BWVL-NEXT: retq entry: %0 = trunc <4 x i64> %a to <4 x i32> diff --git a/test/DebugInfo/Generic/simplifycfg_sink_last_inst.ll b/test/DebugInfo/Generic/simplifycfg_sink_last_inst.ll new file mode 100644 index 000000000000..a62def35acc5 --- /dev/null +++ b/test/DebugInfo/Generic/simplifycfg_sink_last_inst.ll @@ -0,0 +1,70 @@ +; RUN: opt -simplifycfg -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Simplify CFG will try to sink the last instruction in a series of basic +; blocks, creating a "common" instruction in the successor block. 
If the +; debug locations of the commoned instructions have different file/line +; numbers the debug location of the common instruction should not be set. + +; Generated from source: + +; extern int foo(void); +; extern int bar(void); +; +; int test(int a, int b) { +; if(a) +; b -= foo(); +; else +; b -= bar(); +; return b; +; } + +; CHECK: define i32 @test +; CHECK-LABEL: if.end: +; CHECK: %[[PHI:.*]] = phi i32 [ %call1, %if.else ], [ %call, %if.then ] +; CHECK: sub nsw i32 %b, %[[PHI]] +; CHECK-NOT: !dbg +; CHECK: ret i32 + +define i32 @test(i32 %a, i32 %b) !dbg !6 { +entry: + %tobool = icmp ne i32 %a, 0, !dbg !8 + br i1 %tobool, label %if.then, label %if.else, !dbg !8 + +if.then: ; preds = %entry + %call = call i32 @foo(), !dbg !9 + %sub = sub nsw i32 %b, %call, !dbg !10 + br label %if.end, !dbg !11 + +if.else: ; preds = %entry + %call1 = call i32 @bar(), !dbg !12 + %sub2 = sub nsw i32 %b, %call1, !dbg !13 + br label %if.end + +if.end: ; preds = %if.else, %if.then + %b.addr.0 = phi i32 [ %sub, %if.then ], [ %sub2, %if.else ] + ret i32 %b.addr.0, !dbg !14 +} + +declare i32 @foo() +declare i32 @bar() + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "", isOptimized: false, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2) +!1 = !DIFile(filename: "test.c", directory: "") +!2 = !{} +!3 = !{i32 2, !"Dwarf Version", i32 4} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!6 = distinct !DISubprogram(name: "test", scope: !1, file: !1, line: 8, type: !7, isLocal: false, isDefinition: true, scopeLine: 8, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2) +!7 = !DISubroutineType(types: !2) +!8 = !DILocation(line: 9, column: 6, scope: !6) +!9 = !DILocation(line: 10, column: 10, scope: !6) +!10 = !DILocation(line: 10, column: 7, scope: !6) +!11 = !DILocation(line: 10, column: 5, scope: !6) +!12 = !DILocation(line: 12, column: 10, scope: !6) +!13 = !DILocation(line: 12, column: 7, scope: !6) +!14 = !DILocation(line: 13, column: 3, scope: !6) diff --git a/test/DebugInfo/X86/dbg-value-frame-index.ll b/test/DebugInfo/X86/dbg-value-frame-index.ll new file mode 100644 index 000000000000..7b49aacfaefd --- /dev/null +++ b/test/DebugInfo/X86/dbg-value-frame-index.ll @@ -0,0 +1,39 @@ +; RUN: llc -mtriple=x86_64-unknown-unknown -o - %s | FileCheck %s + +define i1 @test() !dbg !4 { +entry: + %end = alloca i64, align 8 + br label %while.cond + +while.cond: + call void @llvm.dbg.value(metadata i64* %end, i64 0, metadata !5, metadata !6), !dbg !7 + %call = call i1 @fn(i64* %end, i64* %end, i64* null, i8* null, i64 0, i64* null, i32* null, i8* null), !dbg !7 + br label %while.body + +while.body: + br i1 0, label %while.end, label %while.cond + +while.end: + ret i1 true +} + +; CHECK-LABEL: test +; CHECK: #DEBUG_VALUE: test:w <- [%RSP+8] + +declare i1 @fn(i64*, i64*, i64*, i8*, i64, i64*, i32*, i8*) +declare void @llvm.dbg.value(metadata, i64, metadata, metadata) + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2,!3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 4.0.0", emissionKind: FullDebug) +!1 = !DIFile(filename: "test.c", directory: "/") +!2 = !{i32 2, !"Dwarf Version", i32 4} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = distinct !DISubprogram(name: "test", type: !10, unit: !0) +!5 = !DILocalVariable(name: "w", scope: !4, type: !9) +!6 = !DIExpression(DW_OP_deref) +!7 = !DILocation(line: 210, column: 12, scope: !4) +!8 = !{!9} +!9 = 
!DIBasicType(name: "bool", size: 8, encoding: DW_ATE_boolean) +!10 = !DISubroutineType(types: !8) diff --git a/test/MC/ARM/coff-relocations.s b/test/MC/ARM/coff-relocations.s index 6ebae709f6cf..fa2d407bb8f3 100644 --- a/test/MC/ARM/coff-relocations.s +++ b/test/MC/ARM/coff-relocations.s @@ -89,7 +89,7 @@ secrel: @ CHECK-RELOCATION: Relocations [ @ CHECK-RELOCATION: Section (1) .text { -@ CHCEK-RELOCATION: 0x0 IMAGE_REL_ARM_BRANCH24T +@ CHECK-RELOCATION: 0x0 IMAGE_REL_ARM_BRANCH24T @ CHECK-RELOCATION: 0x4 IMAGE_REL_ARM_BRANCH20T @ CHECK-RELOCATION: 0x8 IMAGE_REL_ARM_BLX23T @ CHECK-RELOCATION: 0xC IMAGE_REL_ARM_MOV32T diff --git a/test/ThinLTO/X86/drop-debug-info.ll b/test/ThinLTO/X86/drop-debug-info.ll index f8ed8dff420d..a097d6bac98e 100644 --- a/test/ThinLTO/X86/drop-debug-info.ll +++ b/test/ThinLTO/X86/drop-debug-info.ll @@ -3,7 +3,7 @@ ; The imported module has out-of-date debug information, let's make sure we can ; drop them without crashing when materializing later. -; RUN: llvm-lto -thinlto-action=import %t.bc -thinlto-index=%t.index.bc -o - | llvm-dis -o - | FileCheck %s +; RUN: llvm-link %t.bc -summary-index=%t.index.bc -import=globalfunc:%p/Inputs/drop-debug-info.bc | llvm-dis -o - | FileCheck %s ; CHECK: define available_externally void @globalfunc ; CHECK-NOT: llvm.dbg.value @@ -17,4 +17,4 @@ entry: ret i32 0 } -declare void @globalfunc(...)
\ No newline at end of file +declare void @globalfunc(...) diff --git a/test/Transforms/Inline/inline-invoke-tail.ll b/test/Transforms/Inline/inline-invoke-tail.ll index c263fcea6009..5ae27bc0fe25 100644 --- a/test/Transforms/Inline/inline-invoke-tail.ll +++ b/test/Transforms/Inline/inline-invoke-tail.ll @@ -22,7 +22,7 @@ entry: ; CHECK-NOT: invoke ; CHECK-NOT: @foo ; CHECK-NOT: tail -; CHCEK: call void @llvm.memcpy.p0i8.p0i8.i32 +; CHECK: call void @llvm.memcpy.p0i8.p0i8.i32 ; CHECK: br invcont: diff --git a/test/Transforms/InstCombine/add.ll b/test/Transforms/InstCombine/add.ll index 7c46257273a3..39a746ab310b 100644 --- a/test/Transforms/InstCombine/add.ll +++ b/test/Transforms/InstCombine/add.ll @@ -507,3 +507,15 @@ define i1 @test40(i32 %a, i32 %b) { %cmp = icmp eq i32 %add, %b ret i1 %cmp } + +define i64 @test41(i32 %a) { +; CHECK-LABEL: @test41( +; CHECK-NEXT: [[ADD:%.*]] = add nuw i32 %a, 15 +; CHECK-NEXT: [[EXT:%.*]] = zext i32 [[ADD]] to i64 +; CHECK-NEXT: ret i64 [[EXT]] +; + %add = add nuw i32 %a, 16 + %zext = zext i32 %add to i64 + %sub = add i64 %zext, -1 + ret i64 %sub +} diff --git a/test/Transforms/InstCombine/assume.ll b/test/Transforms/InstCombine/assume.ll index 2f9213820f2b..7987aa242319 100644 --- a/test/Transforms/InstCombine/assume.ll +++ b/test/Transforms/InstCombine/assume.ll @@ -188,41 +188,56 @@ entry: declare void @escape(i32* %a) -; Do we canonicalize a nonnull assumption on a load into -; metadata form? +; Canonicalize a nonnull assumption on a load into metadata form. + define i1 @nonnull1(i32** %a) { -entry: +; CHECK-LABEL: @nonnull1( +; CHECK-NEXT: [[LOAD:%.*]] = load i32*, i32** %a, align 8, !nonnull !0 +; CHECK-NEXT: tail call void @escape(i32* nonnull [[LOAD]]) +; CHECK-NEXT: ret i1 false +; %load = load i32*, i32** %a %cmp = icmp ne i32* %load, null tail call void @llvm.assume(i1 %cmp) tail call void @escape(i32* %load) %rval = icmp eq i32* %load, null ret i1 %rval - -; CHECK-LABEL: @nonnull1 -; CHECK: !nonnull -; CHECK-NOT: call void @llvm.assume -; CHECK: ret i1 false } ; Make sure the above canonicalization applies only ; to pointer types. Doing otherwise would be illegal. 
+ define i1 @nonnull2(i32* %a) { -entry: +; CHECK-LABEL: @nonnull2( +; CHECK-NEXT: [[LOAD:%.*]] = load i32, i32* %a, align 4 +; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[LOAD]], 0 +; CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP]]) +; CHECK-NEXT: [[RVAL:%.*]] = icmp eq i32 [[LOAD]], 0 +; CHECK-NEXT: ret i1 [[RVAL]] +; %load = load i32, i32* %a %cmp = icmp ne i32 %load, 0 tail call void @llvm.assume(i1 %cmp) %rval = icmp eq i32 %load, 0 ret i1 %rval - -; CHECK-LABEL: @nonnull2 -; CHECK-NOT: !nonnull -; CHECK: call void @llvm.assume } ; Make sure the above canonicalization does not trigger ; if the assume is control dependent on something else + define i1 @nonnull3(i32** %a, i1 %control) { +; CHECK-LABEL: @nonnull3( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[LOAD:%.*]] = load i32*, i32** %a, align 8 +; CHECK-NEXT: br i1 %control, label %taken, label %not_taken +; CHECK: taken: +; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32* [[LOAD]], null +; CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP]]) +; CHECK-NEXT: [[RVAL:%.*]] = icmp eq i32* [[LOAD]], null +; CHECK-NEXT: ret i1 [[RVAL]] +; CHECK: not_taken: +; CHECK-NEXT: ret i1 true +; entry: %load = load i32*, i32** %a %cmp = icmp ne i32* %load, null @@ -233,17 +248,21 @@ taken: ret i1 %rval not_taken: ret i1 true - -; CHECK-LABEL: @nonnull3 -; CHECK-NOT: !nonnull -; CHECK: call void @llvm.assume } ; Make sure the above canonicalization does not trigger -; if the path from the load to the assume is potentially +; if the path from the load to the assume is potentially ; interrupted by an exception being thrown + define i1 @nonnull4(i32** %a) { -entry: +; CHECK-LABEL: @nonnull4( +; CHECK-NEXT: [[LOAD:%.*]] = load i32*, i32** %a, align 8 +; CHECK-NEXT: tail call void @escape(i32* [[LOAD]]) +; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32* [[LOAD]], null +; CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP]]) +; CHECK-NEXT: [[RVAL:%.*]] = icmp eq i32* [[LOAD]], null +; CHECK-NEXT: ret i1 [[RVAL]] +; %load = load i32*, i32** %a ;; This call may throw! 
tail call void @escape(i32* %load) @@ -251,15 +270,9 @@ entry: tail call void @llvm.assume(i1 %cmp) %rval = icmp eq i32* %load, null ret i1 %rval - -; CHECK-LABEL: @nonnull4 -; CHECK-NOT: !nonnull -; CHECK: call void @llvm.assume } - - attributes #0 = { nounwind uwtable } attributes #1 = { nounwind } diff --git a/test/Transforms/InstCombine/fabs.ll b/test/Transforms/InstCombine/fabs.ll index 0479549bea3f..09bea5895aaf 100644 --- a/test/Transforms/InstCombine/fabs.ll +++ b/test/Transforms/InstCombine/fabs.ll @@ -98,3 +98,51 @@ define float @square_fabs_shrink_call2(float %x) { ; CHECK-NEXT: ret float %sq } +; CHECK-LABEL: @fabs_select_constant_negative_positive( +; CHECK: %fabs = select i1 %cmp, float 1.000000e+00, float 2.000000e+00 +; CHECK-NEXT: ret float %fabs +define float @fabs_select_constant_negative_positive(i32 %c) { + %cmp = icmp eq i32 %c, 0 + %select = select i1 %cmp, float -1.0, float 2.0 + %fabs = call float @llvm.fabs.f32(float %select) + ret float %fabs +} + +; CHECK-LABEL: @fabs_select_constant_positive_negative( +; CHECK: %fabs = select i1 %cmp, float 1.000000e+00, float 2.000000e+00 +; CHECK-NEXT: ret float %fabs +define float @fabs_select_constant_positive_negative(i32 %c) { + %cmp = icmp eq i32 %c, 0 + %select = select i1 %cmp, float 1.0, float -2.0 + %fabs = call float @llvm.fabs.f32(float %select) + ret float %fabs +} + +; CHECK-LABEL: @fabs_select_constant_negative_negative( +; CHECK: %fabs = select i1 %cmp, float 1.000000e+00, float 2.000000e+00 +; CHECK-NEXT: ret float %fabs +define float @fabs_select_constant_negative_negative(i32 %c) { + %cmp = icmp eq i32 %c, 0 + %select = select i1 %cmp, float -1.0, float -2.0 + %fabs = call float @llvm.fabs.f32(float %select) + ret float %fabs +} + +; CHECK-LABEL: @fabs_select_constant_neg0( +; CHECK-NEXT: ret float 0.0 +define float @fabs_select_constant_neg0(i32 %c) { + %cmp = icmp eq i32 %c, 0 + %select = select i1 %cmp, float -0.0, float 0.0 + %fabs = call float @llvm.fabs.f32(float %select) + ret float %fabs +} + +; CHECK-LABEL: @fabs_select_var_constant_negative( +; CHECK: %select = select i1 %cmp, float %x, float -1.000000e+00 +; CHECK: %fabs = call float @llvm.fabs.f32(float %select) +define float @fabs_select_var_constant_negative(i32 %c, float %x) { + %cmp = icmp eq i32 %c, 0 + %select = select i1 %cmp, float %x, float -1.0 + %fabs = call float @llvm.fabs.f32(float %select) + ret float %fabs +} diff --git a/test/Transforms/InstCombine/fma.ll b/test/Transforms/InstCombine/fma.ll new file mode 100644 index 000000000000..e41f1e7edd46 --- /dev/null +++ b/test/Transforms/InstCombine/fma.ll @@ -0,0 +1,203 @@ +; RUN: opt -S -instcombine < %s | FileCheck %s + +declare float @llvm.fma.f32(float, float, float) #1 +declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) #1 +declare float @llvm.fmuladd.f32(float, float, float) #1 +declare float @llvm.fabs.f32(float) #1 + +@external = external global i32 + +; CHECK-LABEL: @fma_fneg_x_fneg_y( +; CHECK: %fma = call float @llvm.fma.f32(float %x, float %y, float %z) +define float @fma_fneg_x_fneg_y(float %x, float %y, float %z) { + %x.fneg = fsub float -0.0, %x + %y.fneg = fsub float -0.0, %y + %fma = call float @llvm.fma.f32(float %x.fneg, float %y.fneg, float %z) + ret float %fma +} + +; CHECK-LABEL: @fma_fneg_x_fneg_y_fast( +; CHECK: %fma = call fast float @llvm.fma.f32(float %x, float %y, float %z) +define float @fma_fneg_x_fneg_y_fast(float %x, float %y, float %z) { + %x.fneg = fsub float -0.0, %x + %y.fneg = fsub float -0.0, %y + %fma = call fast float 
@llvm.fma.f32(float %x.fneg, float %y.fneg, float %z) + ret float %fma +} + +; CHECK-LABEL: @fma_fneg_const_fneg_y( +; CHECK: %fma = call float @llvm.fma.f32(float %y, float bitcast (i32 ptrtoint (i32* @external to i32) to float), float %z) +define float @fma_fneg_const_fneg_y(float %y, float %z) { + %y.fneg = fsub float -0.0, %y + %fma = call float @llvm.fma.f32(float fsub (float -0.0, float bitcast (i32 ptrtoint (i32* @external to i32) to float)), float %y.fneg, float %z) + ret float %fma +} + +; CHECK-LABEL: @fma_fneg_x_fneg_const( +; CHECK: %fma = call float @llvm.fma.f32(float %x, float bitcast (i32 ptrtoint (i32* @external to i32) to float), float %z) +define float @fma_fneg_x_fneg_const(float %x, float %z) { + %x.fneg = fsub float -0.0, %x + %fma = call float @llvm.fma.f32(float %x.fneg, float fsub (float -0.0, float bitcast (i32 ptrtoint (i32* @external to i32) to float)), float %z) + ret float %fma +} + +; CHECK-LABEL: @fma_fabs_x_fabs_y( +; CHECK: %x.fabs = call float @llvm.fabs.f32(float %x) +; CHECK: %y.fabs = call float @llvm.fabs.f32(float %y) +; CHECK: %fma = call float @llvm.fma.f32(float %x.fabs, float %y.fabs, float %z) +define float @fma_fabs_x_fabs_y(float %x, float %y, float %z) { + %x.fabs = call float @llvm.fabs.f32(float %x) + %y.fabs = call float @llvm.fabs.f32(float %y) + %fma = call float @llvm.fma.f32(float %x.fabs, float %y.fabs, float %z) + ret float %fma +} + +; CHECK-LABEL: @fma_fabs_x_fabs_x( +; CHECK: %fma = call float @llvm.fma.f32(float %x, float %x, float %z) +define float @fma_fabs_x_fabs_x(float %x, float %z) { + %x.fabs = call float @llvm.fabs.f32(float %x) + %fma = call float @llvm.fma.f32(float %x.fabs, float %x.fabs, float %z) + ret float %fma +} + +; CHECK-LABEL: @fma_fabs_x_fabs_x_fast( +; CHECK: %fma = call fast float @llvm.fma.f32(float %x, float %x, float %z) +define float @fma_fabs_x_fabs_x_fast(float %x, float %z) { + %x.fabs = call float @llvm.fabs.f32(float %x) + %fma = call fast float @llvm.fma.f32(float %x.fabs, float %x.fabs, float %z) + ret float %fma +} + +; CHECK-LABEL: @fmuladd_fneg_x_fneg_y( +; CHECK: %fmuladd = call float @llvm.fmuladd.f32(float %x, float %y, float %z) +define float @fmuladd_fneg_x_fneg_y(float %x, float %y, float %z) { + %x.fneg = fsub float -0.0, %x + %y.fneg = fsub float -0.0, %y + %fmuladd = call float @llvm.fmuladd.f32(float %x.fneg, float %y.fneg, float %z) + ret float %fmuladd +} + +; CHECK-LABEL: @fmuladd_fneg_x_fneg_y_fast( +; CHECK: %fmuladd = call fast float @llvm.fmuladd.f32(float %x, float %y, float %z) +define float @fmuladd_fneg_x_fneg_y_fast(float %x, float %y, float %z) { + %x.fneg = fsub float -0.0, %x + %y.fneg = fsub float -0.0, %y + %fmuladd = call fast float @llvm.fmuladd.f32(float %x.fneg, float %y.fneg, float %z) + ret float %fmuladd +} + +; CHECK-LABEL: @fmuladd_fneg_const_fneg_y( +; CHECK: %fmuladd = call float @llvm.fmuladd.f32(float %y, float bitcast (i32 ptrtoint (i32* @external to i32) to float), float %z) +define float @fmuladd_fneg_const_fneg_y(float %y, float %z) { + %y.fneg = fsub float -0.0, %y + %fmuladd = call float @llvm.fmuladd.f32(float fsub (float -0.0, float bitcast (i32 ptrtoint (i32* @external to i32) to float)), float %y.fneg, float %z) + ret float %fmuladd +} + +; CHECK-LABEL: @fmuladd_fneg_x_fneg_const( +; CHECK: %fmuladd = call float @llvm.fmuladd.f32(float %x, float bitcast (i32 ptrtoint (i32* @external to i32) to float), float %z) +define float @fmuladd_fneg_x_fneg_const(float %x, float %z) { + %x.fneg = fsub float -0.0, %x + %fmuladd = call float 
@llvm.fmuladd.f32(float %x.fneg, float fsub (float -0.0, float bitcast (i32 ptrtoint (i32* @external to i32) to float)), float %z) + ret float %fmuladd +} + +; CHECK-LABEL: @fmuladd_fabs_x_fabs_y( +; CHECK: %x.fabs = call float @llvm.fabs.f32(float %x) +; CHECK: %y.fabs = call float @llvm.fabs.f32(float %y) +; CHECK: %fmuladd = call float @llvm.fmuladd.f32(float %x.fabs, float %y.fabs, float %z) +define float @fmuladd_fabs_x_fabs_y(float %x, float %y, float %z) { + %x.fabs = call float @llvm.fabs.f32(float %x) + %y.fabs = call float @llvm.fabs.f32(float %y) + %fmuladd = call float @llvm.fmuladd.f32(float %x.fabs, float %y.fabs, float %z) + ret float %fmuladd +} + +; CHECK-LABEL: @fmuladd_fabs_x_fabs_x( +; CHECK: %fmuladd = call float @llvm.fmuladd.f32(float %x, float %x, float %z) +define float @fmuladd_fabs_x_fabs_x(float %x, float %z) { + %x.fabs = call float @llvm.fabs.f32(float %x) + %fmuladd = call float @llvm.fmuladd.f32(float %x.fabs, float %x.fabs, float %z) + ret float %fmuladd +} + +; CHECK-LABEL: @fmuladd_fabs_x_fabs_x_fast( +; CHECK: %fmuladd = call fast float @llvm.fmuladd.f32(float %x, float %x, float %z) +define float @fmuladd_fabs_x_fabs_x_fast(float %x, float %z) { + %x.fabs = call float @llvm.fabs.f32(float %x) + %fmuladd = call fast float @llvm.fmuladd.f32(float %x.fabs, float %x.fabs, float %z) + ret float %fmuladd +} + +; CHECK-LABEL: @fma_k_y_z( +; CHECK: %fma = call float @llvm.fma.f32(float %y, float 4.000000e+00, float %z) +define float @fma_k_y_z(float %y, float %z) { + %fma = call float @llvm.fma.f32(float 4.0, float %y, float %z) + ret float %fma +} + +; CHECK-LABEL: @fma_k_y_z_fast( +; CHECK: %fma = call fast float @llvm.fma.f32(float %y, float 4.000000e+00, float %z) +define float @fma_k_y_z_fast(float %y, float %z) { + %fma = call fast float @llvm.fma.f32(float 4.0, float %y, float %z) + ret float %fma +} + +; CHECK-LABEL: @fmuladd_k_y_z_fast( +; CHECK: %fmuladd = call fast float @llvm.fmuladd.f32(float %y, float 4.000000e+00, float %z) +define float @fmuladd_k_y_z_fast(float %y, float %z) { + %fmuladd = call fast float @llvm.fmuladd.f32(float 4.0, float %y, float %z) + ret float %fmuladd +} + +; CHECK-LABEL: @fma_1_y_z( +; CHECK: %fma = fadd float %y, %z +define float @fma_1_y_z(float %y, float %z) { + %fma = call float @llvm.fma.f32(float 1.0, float %y, float %z) + ret float %fma +} + +; CHECK-LABEL: @fma_x_1_z( +; CHECK: %fma = fadd float %x, %z +define float @fma_x_1_z(float %x, float %z) { + %fma = call float @llvm.fma.f32(float %x, float 1.0, float %z) + ret float %fma +} + +; CHECK-LABEL: @fma_x_1_z_v2f32( +; CHECK: %fma = fadd <2 x float> %x, %z +define <2 x float> @fma_x_1_z_v2f32(<2 x float> %x, <2 x float> %z) { + %fma = call <2 x float> @llvm.fma.v2f32(<2 x float> %x, <2 x float> <float 1.0, float 1.0>, <2 x float> %z) + ret <2 x float> %fma +} + +; CHECK-LABEL: @fma_x_1_2_z_v2f32( +; CHECK: %fma = call <2 x float> @llvm.fma.v2f32(<2 x float> %x, <2 x float> <float 1.000000e+00, float 2.000000e+00>, <2 x float> %z) +define <2 x float> @fma_x_1_2_z_v2f32(<2 x float> %x, <2 x float> %z) { + %fma = call <2 x float> @llvm.fma.v2f32(<2 x float> %x, <2 x float> <float 1.0, float 2.0>, <2 x float> %z) + ret <2 x float> %fma +} + +; CHECK-LABEL: @fma_x_1_z_fast( +; CHECK: %fma = fadd fast float %x, %z +define float @fma_x_1_z_fast(float %x, float %z) { + %fma = call fast float @llvm.fma.f32(float %x, float 1.0, float %z) + ret float %fma +} + +; CHECK-LABEL: @fma_1_1_z( +; CHECK: %fma = fadd float %z, 1.0 +define float @fma_1_1_z(float %z) { + %fma = call 
float @llvm.fma.f32(float 1.0, float 1.0, float %z) + ret float %fma +} + +; CHECK-LABEL: @fmuladd_x_1_z_fast( +; CHECK: %fmuladd = fadd fast float %x, %z +define float @fmuladd_x_1_z_fast(float %x, float %z) { + %fmuladd = call fast float @llvm.fmuladd.f32(float %x, float 1.0, float %z) + ret float %fmuladd +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/Transforms/InstCombine/rem.ll b/test/Transforms/InstCombine/rem.ll index 74d42fa99bf2..89a741c90707 100644 --- a/test/Transforms/InstCombine/rem.ll +++ b/test/Transforms/InstCombine/rem.ll @@ -204,11 +204,11 @@ define i32 @test17(i32 %X) { define i32 @test18(i16 %x, i32 %y) { ; CHECK: @test18 -; CHECK-NEXT: [[AND:%.*]] = and i16 %x, 4 -; CHECK-NEXT: [[EXT:%.*]] = zext i16 [[AND]] to i32 -; CHECK-NEXT: [[SHL:%.*]] = shl nuw nsw i32 [[EXT]], 3 -; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[SHL]], 63 -; CHECK-NEXT: [[REM:%.*]] = and i32 [[XOR]], %y +; CHECK-NEXT: [[SHL:%.*]] = shl i16 %x, 3 +; CHECK-NEXT: [[AND:%.*]] = and i16 [[SHL]], 32 +; CHECK-NEXT: [[XOR:%.*]] = xor i16 [[AND]], 63 +; CHECK-NEXT: [[EXT:%.*]] = zext i16 [[XOR]] to i32 +; CHECK-NEXT: [[REM:%.*]] = and i32 [[EXT]], %y ; CHECK-NEXT: ret i32 [[REM]] %1 = and i16 %x, 4 %2 = icmp ne i16 %1, 0 diff --git a/test/Transforms/InstCombine/shift.ll b/test/Transforms/InstCombine/shift.ll index dab212dc583d..c046a72110c2 100644 --- a/test/Transforms/InstCombine/shift.ll +++ b/test/Transforms/InstCombine/shift.ll @@ -1049,3 +1049,15 @@ define <2 x i65> @test_63(<2 x i64> %t) { %b = ashr <2 x i65> %sext, <i65 33, i65 33> ret <2 x i65> %b } + +define i64 @test_64(i32 %t) { +; CHECK-LABEL: @test_64( +; CHECK-NEXT: [[SHL:%.*]] = shl i32 %t, 8 +; CHECK-NEXT: [[EXT:%.*]] = zext i32 [[SHL]] to i64 +; CHECK-NEXT: ret i64 [[EXT]] + + %and = and i32 %t, 16777215 + %ext = zext i32 %and to i64 + %shl = shl i64 %ext, 8 + ret i64 %shl +} diff --git a/test/Transforms/InstCombine/sink-zext.ll b/test/Transforms/InstCombine/sink-zext.ll new file mode 100644 index 000000000000..7764ca76cc9a --- /dev/null +++ b/test/Transforms/InstCombine/sink-zext.ll @@ -0,0 +1,71 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -instcombine -S | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +declare i32 @callee() + +define i64 @test1(i32 %V) { +; CHECK-LABEL: @test1( +; CHECK-NEXT: [[CALL1:%.*]] = call i32 @callee(), !range !0 +; CHECK-NEXT: [[CALL2:%.*]] = call i32 @callee(), !range !0 +; CHECK-NEXT: [[ADDCONV:%.*]] = add nuw nsw i32 [[CALL1]], [[CALL2]] +; CHECK-NEXT: [[ADD:%.*]] = zext i32 [[ADD:%.*]]conv to i64 +; CHECK-NEXT: ret i64 [[ADD]] +; + %call1 = call i32 @callee(), !range !0 + %call2 = call i32 @callee(), !range !0 + %zext1 = sext i32 %call1 to i64 + %zext2 = sext i32 %call2 to i64 + %add = add i64 %zext1, %zext2 + ret i64 %add +} + +define i64 @test2(i32 %V) { +; CHECK-LABEL: @test2( +; CHECK-NEXT: [[CALL1:%.*]] = call i32 @callee(), !range !0 +; CHECK-NEXT: [[CALL2:%.*]] = call i32 @callee(), !range !0 +; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i32 [[CALL1]], [[CALL2]] +; CHECK-NEXT: [[ZEXT1:%.*]] = zext i32 [[ADD]] to i64 +; CHECK-NEXT: ret i64 [[ZEXT1]] +; + %call1 = call i32 @callee(), !range !0 + %call2 = call i32 @callee(), !range !0 + %add = add i32 %call1, %call2 + %zext = sext i32 %add to i64 + ret i64 %zext +} + +define i64 @test3(i32 %V) { +; CHECK-LABEL: @test3( +; CHECK-NEXT: [[CALL1:%.*]] = call i32 @callee(), !range !0 +; CHECK-NEXT: 
[[CALL2:%.*]] = call i32 @callee(), !range !0 +; CHECK-NEXT: [[MULCONV:%.*]] = mul nuw nsw i32 [[CALL1]], [[CALL2]] +; CHECK-NEXT: [[ADD:%.*]] = zext i32 [[MULCONV]] to i64 +; CHECK-NEXT: ret i64 [[ADD]] +; + %call1 = call i32 @callee(), !range !0 + %call2 = call i32 @callee(), !range !0 + %zext1 = sext i32 %call1 to i64 + %zext2 = sext i32 %call2 to i64 + %add = mul i64 %zext1, %zext2 + ret i64 %add +} + +define i64 @test4(i32 %V) { +; CHECK-LABEL: @test4( +; CHECK-NEXT: [[CALL1:%.*]] = call i32 @callee(), !range !0 +; CHECK-NEXT: [[CALL2:%.*]] = call i32 @callee(), !range !0 +; CHECK-NEXT: [[ADD:%.*]] = mul nuw nsw i32 [[CALL1]], [[CALL2]] +; CHECK-NEXT: [[ZEXT1:%.*]] = zext i32 [[ADD]] to i64 +; CHECK-NEXT: ret i64 [[ZEXT1]] +; + %call1 = call i32 @callee(), !range !0 + %call2 = call i32 @callee(), !range !0 + %add = mul i32 %call1, %call2 + %zext = sext i32 %add to i64 + ret i64 %zext +} + +!0 = !{ i32 0, i32 2000 } diff --git a/test/Transforms/LoopIdiom/basic.ll b/test/Transforms/LoopIdiom/basic.ll index 4d584de9c6f7..270de2edf7ae 100644 --- a/test/Transforms/LoopIdiom/basic.ll +++ b/test/Transforms/LoopIdiom/basic.ll @@ -97,8 +97,7 @@ for.end: ; preds = %entry ; CHECK: ret void } - -;; TODO: We should be able to promote this memset. Not yet though. +; Make sure the first store in the loop is turned into a memset. define void @test4(i8* %Base) nounwind ssp { bb.nph: ; preds = %entry %Base100 = getelementptr i8, i8* %Base, i64 1000 @@ -118,9 +117,8 @@ for.body: ; preds = %bb.nph, %for.body for.end: ; preds = %for.body, %entry ret void -; CHECK-TODO-LABEL: @test4( -; CHECK-TODO: call void @llvm.memset.p0i8.i64(i8* %Base, i8 0, i64 100, i32 1, i1 false) -; CHECK-TODO-NOT: store +; CHECK-LABEL: @test4( +; CHECK: call void @llvm.memset.p0i8.i64(i8* %Base, i8 0, i64 100, i32 1, i1 false) } ; This can't be promoted: the memset is a store of a loop variant value. 
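For context, here is a minimal sketch (illustrative only, not taken from the patch) of the two loop shapes the basic.ll comments contrast: a store of a loop-invariant byte, which loop-idiom can now promote to the memset the updated CHECK line expects, and a store of a loop-variant value, which must be left as individual stores.

define void @sketch_invariant(i8* %Base) {
entry:
  br label %loop
loop:
  %i = phi i64 [ 0, %entry ], [ %i.next, %loop ]
  %p = getelementptr i8, i8* %Base, i64 %i
  store i8 0, i8* %p                         ; stored value does not depend on %i
  %i.next = add i64 %i, 1
  %done = icmp eq i64 %i.next, 100
  br i1 %done, label %exit, label %loop
exit:
  ret void
}
; Expected to become, roughly:
;   call void @llvm.memset.p0i8.i64(i8* %Base, i8 0, i64 100, i32 1, i1 false)

define void @sketch_variant(i8* %Base) {
entry:
  br label %loop
loop:
  %i = phi i64 [ 0, %entry ], [ %i.next, %loop ]
  %p = getelementptr i8, i8* %Base, i64 %i
  %v = trunc i64 %i to i8
  store i8 %v, i8* %p                        ; stored value varies with %i, so no memset
  %i.next = add i64 %i, 1
  %done = icmp eq i64 %i.next, 100
  br i1 %done, label %exit, label %loop
exit:
  ret void
}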
diff --git a/test/Transforms/LoopUnroll/peel-loop-pgo.ll b/test/Transforms/LoopUnroll/peel-loop-pgo.ll index 2987b03c04d0..18309b0691fa 100644 --- a/test/Transforms/LoopUnroll/peel-loop-pgo.ll +++ b/test/Transforms/LoopUnroll/peel-loop-pgo.ll @@ -43,5 +43,5 @@ for.end: ; preds = %for.cond.for.end_cr ;CHECK: !1 = !{!"branch_weights", i32 900, i32 101} ;CHECK: !2 = !{!"branch_weights", i32 540, i32 360} ;CHECK: !3 = !{!"branch_weights", i32 162, i32 378} -;CHECK: !4 = !{!"branch_weights", i32 560, i32 162} +;CHECK: !4 = !{!"branch_weights", i32 1399, i32 162} diff --git a/test/Transforms/NewGVN/equivalent-phi.ll b/test/Transforms/NewGVN/equivalent-phi.ll new file mode 100644 index 000000000000..2deeb760f27e --- /dev/null +++ b/test/Transforms/NewGVN/equivalent-phi.ll @@ -0,0 +1,68 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -basicaa -newgvn -S | FileCheck %s +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" + +@global = common global [1024 x i32] zeroinitializer, align 16 + +;; We should be able to prove the equivalence of two of the phis, and then use that to eliminate +;; one set of indexing calculations and a load + +; Function Attrs: nounwind ssp uwtable +define i32 @bar(i32 %arg, i32 %arg1, i32 %arg2) #0 { +; CHECK-LABEL: @bar( +; CHECK-NEXT: bb: +; CHECK-NEXT: br label %bb3 +; CHECK: bb3: +; CHECK-NEXT: [[TMP:%.*]] = phi i32 [ %arg, %bb ], [ [[TMP:%.*]]15, %bb17 ] +; CHECK-NEXT: [[TMP4:%.*]] = phi i32 [ %arg2, %bb ], [ [[TMP18:%.*]], %bb17 ] +; CHECK-NEXT: [[TMP6:%.*]] = phi i32 [ 0, %bb ], [ [[TMP14:%.*]], %bb17 ] +; CHECK-NEXT: [[TMP7:%.*]] = sext i32 [[TMP]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @global, i64 0, i64 [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = add nsw i32 [[TMP6]], [[TMP9]] +; CHECK-NEXT: [[TMP14]] = add nsw i32 [[TMP10]], [[TMP9]] +; CHECK-NEXT: [[TMP15:%.*]] = add nsw i32 [[TMP]], %arg1 +; CHECK-NEXT: br label %bb17 +; CHECK: bb17: +; CHECK-NEXT: [[TMP18]] = add i32 [[TMP4]], -1 +; CHECK-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP4]], 0 +; CHECK-NEXT: br i1 [[TMP19]], label %bb3, label %bb20 +; CHECK: bb20: +; CHECK-NEXT: ret i32 [[TMP14]] +; +bb: + br label %bb3 + +bb3: ; preds = %bb17, %bb + %tmp = phi i32 [ %arg, %bb ], [ %tmp15, %bb17 ] + %tmp4 = phi i32 [ %arg2, %bb ], [ %tmp18, %bb17 ] + %tmp5 = phi i32 [ %arg, %bb ], [ %tmp16, %bb17 ] + %tmp6 = phi i32 [ 0, %bb ], [ %tmp14, %bb17 ] + %tmp7 = sext i32 %tmp to i64 + %tmp8 = getelementptr inbounds [1024 x i32], [1024 x i32]* @global, i64 0, i64 %tmp7 + %tmp9 = load i32, i32* %tmp8, align 4 + %tmp10 = add nsw i32 %tmp6, %tmp9 + %tmp11 = sext i32 %tmp5 to i64 + %tmp12 = getelementptr inbounds [1024 x i32], [1024 x i32]* @global, i64 0, i64 %tmp11 + %tmp13 = load i32, i32* %tmp12, align 4 + %tmp14 = add nsw i32 %tmp10, %tmp13 + %tmp15 = add nsw i32 %tmp, %arg1 + %tmp16 = add nsw i32 %tmp5, %arg1 + br label %bb17 + +bb17: ; preds = %bb3 + %tmp18 = add i32 %tmp4, -1 + %tmp19 = icmp ne i32 %tmp4, 0 + br i1 %tmp19, label %bb3, label %bb20 + +bb20: ; preds = %bb17 + ret i32 %tmp14 +} + +attributes #0 = { nounwind ssp uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3" "unsafe-fp-math"="false" 
"use-soft-float"="false" } + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"PIC Level", i32 2} +!1 = !{!"Apple LLVM version 8.0.0 (clang-800.0.42.1)"} diff --git a/test/Transforms/NewGVN/pr31483.ll b/test/Transforms/NewGVN/pr31483.ll new file mode 100644 index 000000000000..94b485a990b6 --- /dev/null +++ b/test/Transforms/NewGVN/pr31483.ll @@ -0,0 +1,106 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -basicaa -newgvn -S | FileCheck %s +target datalayout = "E-m:e-i64:64-n32:64" + +@global = external hidden unnamed_addr constant [11 x i8], align 1 +;; Ensure we do not believe the indexing increments are unreachable due to incorrect memory +;; equivalence detection. In PR31483, we were deleting those blocks as unreachable +; Function Attrs: nounwind +define signext i32 @ham(i8* %arg, i8* %arg1) #0 { +; CHECK-LABEL: @ham( +; CHECK-NEXT: bb: +; CHECK-NEXT: [[TMP:%.*]] = alloca i8*, align 8 +; CHECK-NEXT: store i8* %arg1, i8** [[TMP]], align 8 +; CHECK-NEXT: br label %bb2 +; CHECK: bb2: +; CHECK-NEXT: [[TMP3:%.*]] = phi i8* [ %arg, %bb ], [ %tmp7, %bb22 ] +; CHECK-NEXT: [[TMP4:%.*]] = load i8, i8* [[TMP3]], align 1 +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne i8 [[TMP4]], 0 +; CHECK-NEXT: br i1 [[TMP5]], label %bb6, label %bb23 +; CHECK: bb6: +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[TMP3]], i32 1 +; CHECK-NEXT: [[TMP9:%.*]] = zext i8 [[TMP4]] to i32 +; CHECK-NEXT: switch i32 [[TMP9]], label %bb22 [ +; CHECK-NEXT: i32 115, label %bb10 +; CHECK-NEXT: i32 105, label %bb16 +; CHECK-NEXT: i32 99, label %bb16 +; CHECK-NEXT: ] +; CHECK: bb10: +; CHECK-NEXT: [[TMP11:%.*]] = load i8*, i8** [[TMP]], align 8 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[TMP11]], i64 8 +; CHECK-NEXT: store i8* [[TMP12]], i8** [[TMP]], align 8 +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP11]] to i8** +; CHECK-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 8 +; CHECK-NEXT: [[TMP15:%.*]] = call signext i32 (i8*, ...) @zot(i8* getelementptr inbounds ([11 x i8], [11 x i8]* @global, i32 0, i32 0), i8* [[TMP14]]) +; CHECK-NEXT: br label %bb22 +; CHECK: bb16: +; CHECK-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP]], align 8 +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, i8* [[TMP17]], i64 8 +; CHECK-NEXT: store i8* [[TMP18]], i8** [[TMP]], align 8 +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, i8* [[TMP17]], i64 4 +; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8* [[TMP19]] to i32* +; CHECK-NEXT: br label %bb22 +; CHECK: bb22: +; CHECK-NEXT: br label %bb2 +; CHECK: bb23: +; CHECK-NEXT: [[TMP24:%.*]] = bitcast i8** [[TMP]] to i8* +; CHECK-NEXT: call void @llvm.va_end(i8* [[TMP24]]) +; CHECK-NEXT: ret i32 undef +; +bb: + %tmp = alloca i8*, align 8 + store i8* %arg1, i8** %tmp, align 8 + br label %bb2 + +bb2: ; preds = %bb22, %bb + %tmp3 = phi i8* [ %arg, %bb ], [ %tmp7, %bb22 ] + %tmp4 = load i8, i8* %tmp3, align 1 + %tmp5 = icmp ne i8 %tmp4, 0 + br i1 %tmp5, label %bb6, label %bb23 + +bb6: ; preds = %bb2 + %tmp7 = getelementptr inbounds i8, i8* %tmp3, i32 1 + %tmp8 = load i8, i8* %tmp3, align 1 + %tmp9 = zext i8 %tmp8 to i32 + switch i32 %tmp9, label %bb22 [ + i32 115, label %bb10 + i32 105, label %bb16 + i32 99, label %bb16 + ] + +bb10: ; preds = %bb6 + %tmp11 = load i8*, i8** %tmp, align 8 + %tmp12 = getelementptr inbounds i8, i8* %tmp11, i64 8 + store i8* %tmp12, i8** %tmp, align 8 + %tmp13 = bitcast i8* %tmp11 to i8** + %tmp14 = load i8*, i8** %tmp13, align 8 + %tmp15 = call signext i32 (i8*, ...) 
@zot(i8* getelementptr inbounds ([11 x i8], [11 x i8]* @global, i32 0, i32 0), i8* %tmp14) + br label %bb22 + +bb16: ; preds = %bb6, %bb6 + %tmp17 = load i8*, i8** %tmp, align 8 + %tmp18 = getelementptr inbounds i8, i8* %tmp17, i64 8 + store i8* %tmp18, i8** %tmp, align 8 + %tmp19 = getelementptr inbounds i8, i8* %tmp17, i64 4 + %tmp20 = bitcast i8* %tmp19 to i32* + %tmp21 = load i32, i32* %tmp20, align 4 + br label %bb22 + +bb22: ; preds = %bb16, %bb10, %bb6 + br label %bb2 + +bb23: ; preds = %bb2 + %tmp24 = bitcast i8** %tmp to i8* + call void @llvm.va_end(i8* %tmp24) + ret i32 undef +} + +declare signext i32 @zot(i8*, ...) #1 + +; Function Attrs: nounwind +declare void @llvm.va_end(i8*) #2 + +attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64" "target-features"="+altivec,-bpermd,-crypto,-direct-move,-extdiv,-power8-vector,-qpx,-vsx" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64" "target-features"="+altivec,-bpermd,-crypto,-direct-move,-extdiv,-power8-vector,-qpx,-vsx" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nounwind } + diff --git a/test/Transforms/PartiallyInlineLibCalls/X86/good-prototype.ll b/test/Transforms/PartiallyInlineLibCalls/X86/good-prototype.ll new file mode 100644 index 000000000000..0011134640c3 --- /dev/null +++ b/test/Transforms/PartiallyInlineLibCalls/X86/good-prototype.ll @@ -0,0 +1,21 @@ +; RUN: opt -S -partially-inline-libcalls -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s +; RUN: opt -S -passes=partially-inline-libcalls -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s + +define float @f(float %val) { +; CHECK: @f +; CHECK: entry: +; CHECK-NEXT: %[[RES:.+]] = tail call float @sqrtf(float %val) #0 +; CHECK-NEXT: %[[CMP:.+]] = fcmp oeq float %[[RES]], %[[RES]] +; CHECK-NEXT: br i1 %[[CMP]], label %[[EXIT:.+]], label %[[CALL:.+]] +; CHECK: [[CALL]]: +; CHECK-NEXT: %[[RES2:.+]] = tail call float @sqrtf(float %val){{$}} +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: %[[RET:.+]] = phi float [ %[[RES]], %entry ], [ %[[RES2]], %[[CALL]] ] +; CHECK-NEXT: ret float %[[RET]] +entry: + %res = tail call float @sqrtf(float %val) + ret float %res +} + +declare float @sqrtf(float) diff --git a/test/Transforms/PartiallyInlineLibCalls/X86/lit.local.cfg b/test/Transforms/PartiallyInlineLibCalls/X86/lit.local.cfg new file mode 100644 index 000000000000..afde89be896d --- /dev/null +++ b/test/Transforms/PartiallyInlineLibCalls/X86/lit.local.cfg @@ -0,0 +1,2 @@ +if not 'X86' in config.root.targets: + config.unsupported = True diff --git a/test/Transforms/SLPVectorizer/X86/horizontal-list.ll b/test/Transforms/SLPVectorizer/X86/horizontal-list.ll index faae28f4fc64..07064304bf01 100644 --- a/test/Transforms/SLPVectorizer/X86/horizontal-list.ll +++ b/test/Transforms/SLPVectorizer/X86/horizontal-list.ll @@ -8,7 +8,8 @@ define float @baz() { ; CHECK-LABEL: @baz( -; CHECK: [[TMP0:%.*]] = load i32, i32* @n, align 4 +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* @n, align 4 ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], 3 ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 
[[MUL]] to float ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr to <2 x float>*), align 16 @@ -64,7 +65,8 @@ entry: define float @bazz() { ; CHECK-LABEL: @bazz( -; CHECK: [[TMP0:%.*]] = load i32, i32* @n, align 4 +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* @n, align 4 ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], 3 ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr to <2 x float>*), align 16 @@ -146,7 +148,8 @@ entry: define float @bazzz() { ; CHECK-LABEL: @bazzz( -; CHECK: [[TMP0:%.*]] = load i32, i32* @n, align 4 +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* @n, align 4 ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to float ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr to <4 x float>*), align 16 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr1 to <4 x float>*), align 16 @@ -188,7 +191,8 @@ entry: define i32 @foo() { ; CHECK-LABEL: @foo( -; CHECK: [[TMP0:%.*]] = load i32, i32* @n, align 4 +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* @n, align 4 ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to float ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr to <4 x float>*), align 16 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr1 to <4 x float>*), align 16 @@ -232,7 +236,8 @@ entry: define float @bar() { ; CHECK-LABEL: @bar( -; CHECK: [[TMP0:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr to <2 x float>*), align 16 +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr to <2 x float>*), align 16 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr1 to <2 x float>*), align 16 ; CHECK-NEXT: [[TMP2:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP0]] ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0 diff --git a/test/tools/gold/X86/Inputs/thinlto.ll b/test/tools/gold/X86/Inputs/thinlto.ll index b81de922b4da..31c72ec4653a 100644 --- a/test/tools/gold/X86/Inputs/thinlto.ll +++ b/test/tools/gold/X86/Inputs/thinlto.ll @@ -1,4 +1,5 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" define void @g() { entry: diff --git a/test/tools/gold/X86/Inputs/thinlto_archive1.ll b/test/tools/gold/X86/Inputs/thinlto_archive1.ll index b81de922b4da..31c72ec4653a 100644 --- a/test/tools/gold/X86/Inputs/thinlto_archive1.ll +++ b/test/tools/gold/X86/Inputs/thinlto_archive1.ll @@ -1,4 +1,5 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" define void @g() { entry: diff --git a/test/tools/gold/X86/Inputs/thinlto_archive2.ll b/test/tools/gold/X86/Inputs/thinlto_archive2.ll index c2bda1712a40..2136ec3471d1 100644 --- a/test/tools/gold/X86/Inputs/thinlto_archive2.ll +++ b/test/tools/gold/X86/Inputs/thinlto_archive2.ll @@ -1,4 +1,5 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" define void @h() { entry: diff --git a/test/tools/gold/X86/comdat.ll b/test/tools/gold/X86/comdat.ll index 8ed520c17c05..7cb1855df2d5 100644 --- a/test/tools/gold/X86/comdat.ll +++ b/test/tools/gold/X86/comdat.ll @@ -1,11 +1,13 @@ ; RUN: llvm-as %s -o %t1.o ; RUN: llvm-as %p/Inputs/comdat.ll -o 
%t2.o ; RUN: %gold -shared -o %t3.o -plugin %llvmshlibdir/LLVMgold.so %t1.o %t2.o \ +; RUN: -m elf_x86_64 \ ; RUN: -plugin-opt=save-temps ; RUN: FileCheck --check-prefix=RES %s < %t3.o.resolution.txt ; RUN: llvm-readobj -t %t3.o | FileCheck --check-prefix=OBJ %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" $c1 = comdat any diff --git a/test/tools/gold/X86/opt-level.ll b/test/tools/gold/X86/opt-level.ll index d072866f7dba..a48c551a9aed 100644 --- a/test/tools/gold/X86/opt-level.ll +++ b/test/tools/gold/X86/opt-level.ll @@ -1,11 +1,14 @@ ; RUN: llvm-as -o %t.bc %s ; RUN: %gold -plugin %llvmshlibdir/LLVMgold.so -plugin-opt=save-temps \ +; RUN: -m elf_x86_64 \ ; RUN: -plugin-opt=O0 -r -o %t.o %t.bc ; RUN: llvm-dis < %t.o.0.4.opt.bc -o - | FileCheck --check-prefix=CHECK-O0 %s ; RUN: %gold -plugin %llvmshlibdir/LLVMgold.so -plugin-opt=save-temps \ +; RUN: -m elf_x86_64 \ ; RUN: -plugin-opt=O1 -r -o %t.o %t.bc ; RUN: llvm-dis < %t.o.0.4.opt.bc -o - | FileCheck --check-prefix=CHECK-O1 %s ; RUN: %gold -plugin %llvmshlibdir/LLVMgold.so -plugin-opt=save-temps \ +; RUN: -m elf_x86_64 \ ; RUN: -plugin-opt=O2 -r -o %t.o %t.bc ; RUN: llvm-dis < %t.o.0.4.opt.bc -o - | FileCheck --check-prefix=CHECK-O2 %s @@ -14,6 +17,7 @@ ; CHECK-O2-NOT: define internal void @foo( target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" define internal void @foo() { ret void diff --git a/test/tools/gold/X86/pr25907.ll b/test/tools/gold/X86/pr25907.ll index f33f2f242458..bfdf4fc90497 100644 --- a/test/tools/gold/X86/pr25907.ll +++ b/test/tools/gold/X86/pr25907.ll @@ -1,10 +1,12 @@ ; RUN: llvm-as %s -o %t.o ; RUN: %gold -plugin %llvmshlibdir/LLVMgold.so \ +; RUN: -m elf_x86_64 \ ; RUN: -shared %t.o -o %t2 ; RUN: llvm-nm %t2 | FileCheck %s ; CHECK: T main target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" @main.L = internal unnamed_addr constant [3 x i8*] [i8* blockaddress(@main, %L1), i8* blockaddress(@main, %L2), i8* null], align 16 diff --git a/test/tools/gold/X86/stats.ll b/test/tools/gold/X86/stats.ll index d278610789fa..15aa080d6fc0 100644 --- a/test/tools/gold/X86/stats.ll +++ b/test/tools/gold/X86/stats.ll @@ -2,6 +2,7 @@ ; RUN: llvm-as %s -o %t.o ; RUN: %gold -plugin %llvmshlibdir/LLVMgold.so -shared \ +; RUN: -m elf_x86_64 \ ; RUN: -plugin-opt=-stats %t.o -o %t2 2>&1 | FileCheck %s ; CHECK: Statistics Collected diff --git a/test/tools/gold/X86/strip_names.ll b/test/tools/gold/X86/strip_names.ll index bb974c8aebeb..dd4a94f83d66 100644 --- a/test/tools/gold/X86/strip_names.ll +++ b/test/tools/gold/X86/strip_names.ll @@ -1,11 +1,13 @@ ; RUN: llvm-as %s -o %t.o ; RUN: %gold -plugin %llvmshlibdir/LLVMgold.so \ +; RUN: -m elf_x86_64 \ ; RUN: --plugin-opt=save-temps \ ; RUN: -shared %t.o -o %t2.o ; RUN: llvm-dis %t2.o.0.2.internalize.bc -o - | FileCheck %s ; RUN: %gold -plugin %llvmshlibdir/LLVMgold.so \ +; RUN: -m elf_x86_64 \ ; RUN: --plugin-opt=emit-llvm \ ; RUN: -shared %t.o -o %t2.o ; RUN: llvm-dis %t2.o -o - | FileCheck ---check-prefix=NONAME %s @@ -25,6 +27,7 @@ ; NONAME: ret i32 %3 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" @GlobalValueName = global i32 0 diff --git a/test/tools/gold/X86/thinlto.ll b/test/tools/gold/X86/thinlto.ll index 9ce070a6b5e6..aee7268dfb96 100644 --- a/test/tools/gold/X86/thinlto.ll +++ b/test/tools/gold/X86/thinlto.ll @@ -3,11 +3,13 @@ ; RUN: llvm-as %s -o %t.o ; RUN: 
llvm-as %p/Inputs/thinlto.ll -o %t2.o ; RUN: %gold -plugin %llvmshlibdir/LLVMgold.so \ +; RUN: -m elf_x86_64 \ ; RUN: --plugin-opt=thinlto \ ; RUN: --plugin-opt=thinlto-index-only \ ; RUN: -shared %t.o %t2.o -o %t3 ; RUN: not test -e %t3 ; RUN: %gold -plugin %llvmshlibdir/LLVMgold.so \ +; RUN: -m elf_x86_64 \ ; RUN: --plugin-opt=thinlto \ ; RUN: -shared %t.o %t2.o -o %t4 ; RUN: llvm-nm %t4 | FileCheck %s --check-prefix=NM @@ -18,6 +20,7 @@ ; Ensure gold generates an index and not a binary if requested. ; RUN: %gold -plugin %llvmshlibdir/LLVMgold.so \ +; RUN: -m elf_x86_64 \ ; RUN: --plugin-opt=thinlto \ ; RUN: --plugin-opt=thinlto-index-only \ ; RUN: -shared %t.o %t2.o -o %t3 @@ -28,6 +31,7 @@ ; Ensure gold generates an index as well as a binary with save-temps in ThinLTO mode. ; First force single-threaded mode ; RUN: %gold -plugin %llvmshlibdir/LLVMgold.so \ +; RUN: -m elf_x86_64 \ ; RUN: --plugin-opt=save-temps \ ; RUN: --plugin-opt=thinlto \ ; RUN: --plugin-opt=jobs=1 \ @@ -37,6 +41,7 @@ ; Check with --no-map-whole-files ; RUN: %gold -plugin %llvmshlibdir/LLVMgold.so \ +; RUN: -m elf_x86_64 \ ; RUN: --plugin-opt=save-temps \ ; RUN: --plugin-opt=thinlto \ ; RUN: --plugin-opt=jobs=1 \ @@ -47,6 +52,7 @@ ; Next force multi-threaded mode ; RUN: %gold -plugin %llvmshlibdir/LLVMgold.so \ +; RUN: -m elf_x86_64 \ ; RUN: --plugin-opt=save-temps \ ; RUN: --plugin-opt=thinlto \ ; RUN: --plugin-opt=jobs=2 \ @@ -56,6 +62,7 @@ ; Test --plugin-opt=obj-path to ensure unique object files generated. ; RUN: %gold -plugin %llvmshlibdir/LLVMgold.so \ +; RUN: -m elf_x86_64 \ ; RUN: --plugin-opt=thinlto \ ; RUN: --plugin-opt=jobs=2 \ ; RUN: --plugin-opt=obj-path=%t5.o \ @@ -116,6 +123,7 @@ ; COMBINED-NEXT: </VALUE_SYMTAB target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" declare void @g(...) diff --git a/test/tools/gold/X86/thinlto_afdo.ll b/test/tools/gold/X86/thinlto_afdo.ll index 083f89d77402..617f9f87a917 100644 --- a/test/tools/gold/X86/thinlto_afdo.ll +++ b/test/tools/gold/X86/thinlto_afdo.ll @@ -4,6 +4,7 @@ ; RUN: rm -f %t1.o.4.opt.bc ; RUN: %gold -plugin %llvmshlibdir/LLVMgold.so \ +; RUN: -m elf_x86_64 \ ; RUN: --plugin-opt=thinlto \ ; RUN: --plugin-opt=save-temps \ ; RUN: --plugin-opt=sample-profile=%p/Inputs/afdo.prof \ @@ -12,6 +13,7 @@ ; RUN: opt -S %t1.o.4.opt.bc | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" ; CHECK: ProfileSummary declare void @g(...) diff --git a/test/tools/gold/X86/thinlto_archive.ll b/test/tools/gold/X86/thinlto_archive.ll index c2ae679dfb03..13038b4fb60e 100644 --- a/test/tools/gold/X86/thinlto_archive.ll +++ b/test/tools/gold/X86/thinlto_archive.ll @@ -9,6 +9,7 @@ ; Test importing from archive library via gold, using jobs=1 to ensure ; output messages are not interleaved. ; RUN: %gold -plugin %llvmshlibdir/LLVMgold.so \ +; RUN: -m elf_x86_64 \ ; RUN: --plugin-opt=thinlto \ ; RUN: --plugin-opt=-print-imports \ ; RUN: --plugin-opt=jobs=1 \ @@ -16,6 +17,7 @@ ; RUN: llvm-nm %t4 | FileCheck %s --check-prefix=NM target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" ; CHECK-DAG: Import g declare void @g(...) 
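The gold-plugin tests in this stretch all gain the same two additions: an explicit -m elf_x86_64 emulation flag on the %gold RUN lines and a target triple line in the IR, presumably so the tests do not depend on the host linker's default emulation or on an implicit module target. A minimal skeleton of the resulting test pattern (file contents are illustrative, not copied from any one test in the patch):

; RUN: llvm-as %s -o %t.o
; RUN: %gold -plugin %llvmshlibdir/LLVMgold.so \
; RUN:     -m elf_x86_64 \
; RUN:     -plugin-opt=save-temps \
; RUN:     -shared %t.o -o %t2.o

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

define void @f() {
  ret void
}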
diff --git a/test/tools/gold/X86/type-merge2.ll b/test/tools/gold/X86/type-merge2.ll index d020336ca7f5..439abd9c2b94 100644 --- a/test/tools/gold/X86/type-merge2.ll +++ b/test/tools/gold/X86/type-merge2.ll @@ -1,11 +1,13 @@ ; RUN: llvm-as %s -o %t.o ; RUN: llvm-as %p/Inputs/type-merge2.ll -o %t2.o ; RUN: %gold -plugin %llvmshlibdir/LLVMgold.so \ +; RUN: -m elf_x86_64 \ ; RUN: --plugin-opt=save-temps \ ; RUN: -shared %t.o %t2.o -o %t3.o ; RUN: llvm-dis %t3.o.0.2.internalize.bc -o - | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" %zed = type { i8 } define void @foo() { diff --git a/test/tools/gold/X86/visibility.ll b/test/tools/gold/X86/visibility.ll index f63bdbd2c959..1c70ebf5c467 100644 --- a/test/tools/gold/X86/visibility.ll +++ b/test/tools/gold/X86/visibility.ll @@ -2,6 +2,7 @@ ; RUN: llvm-as %p/Inputs/visibility.ll -o %t2.o ; RUN: %gold -plugin %llvmshlibdir/LLVMgold.so \ +; RUN: -m elf_x86_64 \ ; RUN: --plugin-opt=save-temps \ ; RUN: -shared %t.o %t2.o -o %t.so ; RUN: llvm-readobj -t %t.so | FileCheck %s @@ -19,6 +20,7 @@ ; IR: define void @foo target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" define weak protected void @foo() { ret void |